Ejemplo n.º 1
0
def writeLexedFile(outputDir, fileExtension, lexedWoComments, fileid, flag,
                   explicitWrite):
    """Write a lexed token stream to <outputDir>/<fileid>.<ext>.tokens.

    All tokens go on a single line, space-separated, with a trailing
    newline.  Whitespace-only tokens are dropped; newlines inside a token
    are removed and remaining internal spaces become '_' so every token
    stays a single word.

    Args:
        outputDir: directory receiving the .tokens file.
        fileExtension: extension pattern; its first two characters are
            stripped before use (e.g. "*.js" -> "js").
        lexedWoComments: iterable of (token_type, token_text) pairs.
        fileid: id used as the output file's base name.
        flag: corpus mode; for "labelled"/"android"/"api" the token type
            is embedded as "<text|type>" when explicitWrite is set.
        explicitWrite: when true (and flag matches), annotate each token
            with its lexer type.
    """
    # Guard against pathologically large inputs (MAX_SIZE is a module global).
    assert (len(lexedWoComments) <= MAX_SIZE)
    # Output format must be single line and be the input file name + .tokens.
    with open(
            outputDir + "/" + str(fileid) + "." + fileExtension[2:] +
            ".tokens", "wb") as outputFile:
        for t in lexedWoComments:
            token = t[1]
            noWS = token.strip()
            noWS = noWS.replace('\n', '')  # Remove new lines
            noWS = noWS.replace(' ', '_')  # Replace remaining internal spaces
            if not noWS:  # Token was pure whitespace; skip it.
                continue
            if flag in ("labelled", "android", "api") and explicitWrite:
                # Annotate the token with its lexer token type.
                noWS = "<" + noWS + "|" + str(t[0]) + ">"
            # File is opened binary, so write bytes (b' ' == ' ' on Py2).
            outputFile.write(noWS.encode("utf-8"))
            outputFile.write(b' ')
        # Without a newline between files, the SRILM ngram tools can choke.
        outputFile.write(b'\n')
Ejemplo n.º 2
0
    # Fold one worker result into the global `orig` map:
    # orig[file_name][def_scope] -> (name, glb).
    # On success result is (file_name, ok, candidates); on failure
    # result[1] is None and result[0]/result[2] carry file name and error.
    if result[1] is not None:
        file_name, ok, candidates = result

        orig.setdefault(file_name, {})

        # Each candidate is (def_scope, name, glb); a later candidate for
        # the same scope overwrites an earlier one.
        for (def_scope, name, glb) in candidates:
            orig[file_name][def_scope] = (name, glb)

#             print file_name, (name, glb)
#             orig[file_name].setdefault(def_scope, [])
#             orig[file_name][def_scope].append(name)

    else:
        # Failed file: log its name and error message (Python 2 print).
        print result[0], result[2]

# Per-file statistics CSV: the header carries one column per renaming
# strategy in four variants (plain, _all, _glb, _maybe).  n2s presumably
# maps a strategy index to its display name -- confirm against its
# definition above this excerpt.
writer = UnicodeWriter(open(os.path.join(results_path, 'stats.csv'), 'w'))
writer.writerow(
    ['file', 'num_names', 'num_glb_names', 'num_loc_names'] +
    [n2s[i].replace('.', '_') for i in range(len(strategies))] +
    [n2s[i].replace('.', '_') + '_all' for i in range(len(strategies))] +
    [n2s[i].replace('.', '_') + '_glb' for i in range(len(strategies))] +
    [n2s[i].replace('.', '_') + '_maybe' for i in range(len(strategies))])

# Per-file counters, one slot per strategy; the loop body continues past
# this excerpt.  (iterkeys() marks this as Python 2 code.)
for file_name in orig.iterkeys():
    row = [file_name]
    counts_loc = [0] * len(strategies)
    counts_glb = [0] * len(strategies)
    counts = [0] * len(strategies)
    alt_counts = [0] * len(strategies)

    num_names = 0
Ejemplo n.º 3
0
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

# Remove a stale log from a previous run; a missing file is fine, but do
# not swallow unrelated errors (the old bare `except:` did).
flog = 'log_' + os.path.basename(training_sample_path)
try:
    for stale in [flog]:
        os.remove(os.path.join(output_path, stale))
except OSError:
    pass


with open(training_sample_path, 'r') as f, \
        open(os.path.join(output_path, flog), 'w') as g:

    reader = UnicodeReader(f)
    writer = UnicodeWriter(g)

    # Process the training sample in parallel; results arrive in
    # arbitrary order.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):

        # result[1] is truthy on success; failure rows carry the file
        # name and the error message instead.
        if result[1]:
            (js_file_path, ok, msg) = result
            writer.writerow([js_file_path, msg])
        else:
            writer.writerow([result[0], result[2]])

    # Release the worker processes once all results are consumed.
    pool.close()
    pool.join()
Ejemplo n.º 4
0
# Remove stale output files from a previous run; ignored if absent.
# NOTE(review): the bare except also hides permission errors.
try:
    for f in [f1, f2, f5, f6, glog]:
        os.remove(os.path.join(output_path, f))
except:
    pass

# One output stream per renaming strategy plus a log; the f3/f4 variants
# are currently disabled (commented out below).
with open(os.path.join(output_path, glog), 'w') as g, \
        open(os.path.join(output_path, f1), 'w') as f_orig, \
        open(os.path.join(output_path, f2), 'w') as f_no_renaming, \
        open(os.path.join(output_path, f5), 'w') as f_hash_def_one_renaming, \
        open(os.path.join(output_path, f6), 'w') as f_hash_def_two_renaming:
    #         open(os.path.join(output_path, f3), 'w') as f_basic_renaming, \
    #         open(os.path.join(output_path, f4), 'w') as f_normalized, \

    writer = UnicodeWriter(g)

    # Process the corpus sample in parallel; result order is arbitrary.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, corpus_sample):

        # result[1] is None on failure; on success unpack one token
        # stream per enabled strategy (handling continues past this
        # excerpt).
        if result[1] is not None:
            (
                js_file_path,
                orig,
                no_renaming,
                #              basic_renaming,
                #              normalized,
                hash_def_one_renaming,
                hash_def_two_renaming) = result
Ejemplo n.º 5
0
        # At this point tmp_<pid>.b.js exists iff the beautifier
        # succeeded (`ok`); compare it against the original to decide
        # whether the input file was minified.
        if ok:
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                # Record the comparison error text in place of a verdict.
                isMini = str(e)

            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]

        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']

    # The matching `try:` lies above this excerpt.
    except TimeExceededError:

        # Beautify/compare exceeded its time budget.
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']
        

    
    
corpus_dir = Folder(sys.argv[1])

# Classify every *.js file in the corpus in parallel and record one
# [file_name, verdict] row per file.  imap preserves input order.
pool = multiprocessing.Pool(processes=8)
with open('isMinified.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    for line in pool.imap(processFile, corpus_dir.fullFileNames("*.js")):
        writer.writerow(line)

# Release the worker processes once all results are consumed
# (previously the pool was never closed).
pool.close()
pool.join()

Ejemplo n.º 6
0
    # Renaming strategies and Moses translation-server proxies used by
    # the workers below.
    RS = RenamingStrategies()

    proxies = MosesProxy().getProxies()

    with open(testing_sample_path, 'r') as f:

        reader = UnicodeReader(f)

        pool = multiprocessing.Pool(processes=num_threads)

        for result in pool.imap_unordered(processFile, reader):

            # Re-open the three output files in append mode for every
            # result -- presumably so partial progress survives a crash,
            # at the cost of an open/close per processed file.
            with open(os.path.join(output_path, flog), 'a') as g, \
                        open(os.path.join(output_path, c_path), 'a') as c, \
                            open(os.path.join(output_path, s_path), 'a') as sM:
                writer = UnicodeWriter(g)
                cw = UnicodeWriter(c)
                sM_writer = UnicodeWriter(sM)

                # Success: (js_file_path, ok, candidates, model_rows);
                # failure: result[1] is None (that branch continues past
                # this excerpt).
                if result[1] is not None:
                    js_file_path, ok, candidates, model_rows = result

                    writer.writerow([js_file_path, ok])

                    # One row per renaming candidate; strip embedded
                    # double quotes so the CSV stays parseable.
                    for r in candidates:
                        cw.writerow([js_file_path] +
                                    [str(x).replace("\"", "") for x in r])

                    for row in model_rows:
                        sM_writer.writerow([js_file_path] + row)
                else:
Ejemplo n.º 7
0
corpus_root = os.path.abspath(sys.argv[2])

output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])


with open(sample_path, 'r') as f:

    reader = UnicodeReader(f)

    # Remove a stale log from a previous run; a missing file is fine.
    # (The old loop variable rebound `f`, shadowing the open sample-file
    # handle above; `stale` avoids that.)
    flog = 'log_js_functions_' + os.path.basename(sample_path)
    try:
        for stale in [flog]:
            os.remove(os.path.join(output_path, stale))
    except OSError:
        pass

    # Process the sample in parallel; results arrive in arbitrary order.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):

        # Append-mode re-open per result so progress is flushed as we go.
        with open(os.path.join(output_path, flog), 'a') as g:
            writer = UnicodeWriter(g)

            # result[1] is None when processing failed for this file.
            if result[1] is not None:
                writer.writerow(result)
            else:
                writer.writerow([result[0], result[2]])

    # Release the worker processes once all results are consumed.
    pool.close()
    pool.join()
Ejemplo n.º 8
0
# Labels recording which rule matched when merging user identities.
PREFIX_NAME = 'PREFIX_NAME'
LOGIN_NAME = 'LOGIN_NAME'
FULL_NAME = 'FULL_NAME'
SIMPLE_NAME = 'SIMPLE_NAME'
LOCATION = 'LOCATION'
DOMAIN = 'EMAIL_DOMAIN'
TWO = 'TWO_OR_MORE_RULES'

# Thresholds -- units/meaning not visible in this excerpt; confirm
# against where they are used.
THR_MIN = 1
THR_MAX = 10

# Populated later -- presumably maps masked values back to originals.
unmask = {}

dataPath = os.path.abspath('../data')

# Three CSV outputs: a log of matching decisions, the final identity
# map, and "maybe" matches that need manual review.
w_log = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_map.csv'),
                            'wb'))
w_maybe = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_maybe.csv'), 'wb'))

# Counters -- presumably for progress reporting every `step` records.
idx = 0
step = 100000
curidx = step

aliases = {}

# Skip the CSV header row (reader.next() marks this as Python 2 code).
#    reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
reader = UnicodeReader(
    open(os.path.join(dataPath, 'active_prolific_users.csv'), 'rb'))
_header = reader.next()
# Near-duplicate of the identity-matching setup above, targeting the
# 2014-01 data snapshot and the cleaned-emails input instead.
PREFIX_NAME         = 'PREFIX_NAME'
LOGIN_NAME          = 'LOGIN_NAME'
FULL_NAME           = 'FULL_NAME'
SIMPLE_NAME         = 'SIMPLE_NAME'
LOCATION            = 'LOCATION'
DOMAIN              = 'EMAIL_DOMAIN'
TWO                 = 'TWO_OR_MORE_RULES'

# Thresholds -- units/meaning not visible in this excerpt.
THR_MIN = 1
THR_MAX = 10

unmask = {}

dataPath = os.path.abspath('../../data/2014-01')

# Three CSV outputs: decision log, final identity map, manual-review set.
w_log = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_maybe.csv'), 'wb'))

# Counters -- presumably for progress reporting every `step` records.
idx = 0
step = 100000
curidx = step

aliases = {}

# Skip the CSV header row (reader.next() marks this as Python 2 code).
#    reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
reader = UnicodeReader(open(os.path.join(dataPath, 'clean', 'users_clean_emails.csv'), 'rb'))
_header = reader.next()

# Helper structures
d_email_uid = {}
Ejemplo n.º 10
0
# Closes the `eligible` list whose opening lies before this excerpt.
]

# Split the eligible corpus 80/20 into (train+tune)/test, then split the
# first 80% further 90/10 into training/tuning.
size = len(eligible)
tt = int(0.8 * size)
training_size = int(0.9 * tt)
tuning_size = int(tt - training_size)
testing_size = size - tt

# Python 2 print statements.
print 'Total:', size
print 'Training:', training_size
print 'Tuning:', tuning_size
print 'Testing:', testing_size

# Draw three disjoint samples without replacement; each later draw
# excludes the members of the earlier ones.
training_sample = random.sample(eligible, training_size)
with open('trainingSample.csv', 'wb') as of:
    writer = UnicodeWriter(of)
    for f in training_sample:
        writer.writerow([f])

tuning_sample = random.sample(
    set(eligible).difference(set(training_sample)), tuning_size)
with open('tuningSample.csv', 'wb') as of:
    writer = UnicodeWriter(of)
    for f in tuning_sample:
        writer.writerow([f])

# The write loop for the testing sample continues past this excerpt.
testing_sample = random.sample(
    set(eligible).difference(set(training_sample)).difference(
        set(tuning_sample)), testing_size)
with open('testingSample.csv', 'wb') as of:
    writer = UnicodeWriter(of)
Ejemplo n.º 11
0
# Tail of a stale-log cleanup: the matching `try:` and `for` lie above
# this excerpt; a missing file is silently ignored.
        os.remove(os.path.join(output_path, f))
except:
    pass

# inputs = Folder(corpus_root).fullFileNames("*.js")

with open(testing_sample_path, 'r') as f:

    reader = UnicodeReader(f)

    #     result = processFile(reader.next())

    # Process the testing sample in parallel; result order is arbitrary.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):
        #     if True:

        # Append-mode re-open per result -- presumably so partial
        # progress survives a crash.
        with open(os.path.join(output_path, flog), 'a') as g, \
                open(os.path.join(output_path, c_path), 'a') as c:
            writer = UnicodeWriter(g)
            cw = UnicodeWriter(c)

            # Success: (js_file_path, ok, candidates); failure:
            # result[1] is None, result[0]/result[2] hold name and error.
            if result[1] is not None:
                js_file_path, ok, candidates = result
                writer.writerow([js_file_path, ok])
                # Strip embedded double quotes so the CSV stays parseable.
                for r in candidates:
                    cw.writerow([js_file_path] +
                                [str(x).replace("\"", "") for x in list(r)])
            else:
                writer.writerow([result[0], result[2]])
Ejemplo n.º 12
0
num_threads = int(sys.argv[3])

# Remove a stale log from a previous run; a missing file is fine, but do
# not swallow unrelated errors (the old bare `except:` did).
flog = 'log_sanity'
try:
    for stale in [flog]:
        os.remove(os.path.join(results_root, stale))
except OSError:
    pass

with open(file_list_path, 'r') as f:

    reader = UnicodeReader(f)

    # Run the sanity check over every listed file in parallel; results
    # arrive in arbitrary order.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):

        # Append-mode re-open per result so progress is flushed as we go.
        with open(os.path.join(results_root, flog), 'a') as g:
            writer = UnicodeWriter(g)

            # Success: (js_file_path, ok, method); failure: result[1] is
            # None, result[0]/result[2] hold the file name and the error.
            if result[1] is not None:
                js_file_path, ok, method = result
                writer.writerow([js_file_path, ok, method])
            else:
                writer.writerow([result[0], result[2], ''])

    # Release the worker processes once all results are consumed.
    pool.close()
    pool.join()
Ejemplo n.º 13
0
input_dir = sys.argv[1]
output_csv = sys.argv[2]
num_threads = int(sys.argv[3])

base_dir = Folder(input_dir)
fileList = base_dir.baseFileNames("*.js")
# Keep only original files ("name.js", exactly one dot); derived files
# such as "name.min.js" carry more.  (The old comprehension variable
# shadowed the builtin `next`.)
origList = [baseName for baseName in fileList if baseName.count(".") == 1]
toProcess = [(nextFile, os.path.join(base_dir.path, nextFile))
             for nextFile in origList]

# Truncate the output CSV before appending per-result rows below.
with open(output_csv, 'w') as g:
    pass

pool = multiprocessing.Pool(processes=num_threads)

for result in pool.imap_unordered(processFile, toProcess):
    print(result[0])
    # Append-mode re-open per result so progress is flushed as we go.
    with open(output_csv, 'a') as g:
        writer = UnicodeWriter(g)

        # result[1] is None when processing failed for this file.
        if result[1] is not None:
            writer.writerow(result)
        else:
            writer.writerow([result[0], "error"])

# Release the worker processes once all results are consumed.
pool.close()
pool.join()
Ejemplo n.º 14
0
# Make the parent directory importable so sibling modules resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 
                                             os.path.pardir)))
from unicodeManager import UnicodeReader, UnicodeWriter
from evalRenamingHelper import *


try:
    csv_path = os.path.abspath(sys.argv[1])
    output_path = os.path.abspath(sys.argv[2])
except:
    print("usage: python evalRenamings.py csv_in csv_out")
    quit()


reader = UnicodeReader(open(csv_path))
writer = UnicodeWriter(open(output_path, 'w'))

# Sequential ids for renaming sites, keyed by (file, line, position).
nextID = 0
IDMap = {}

for row in reader:
    # Build an id from the file, the name's line number and its token
    # position (i.e. the definition site).
    # NOTE(review): `file` shadows the Python 2 builtin of the same name.
    file = row[0]
    orig = row[1]
    line_num = row[4]
    token_pos = row[5]
    suggestion = row[6]
    js_nice_name = row[7]
    IDKey = (file, line_num, token_pos)
    # Reuse the id for a site seen before (loop continues past this
    # excerpt).
    if(IDKey in IDMap):
        rowID = IDMap[IDKey]
Ejemplo n.º 15
0
    
    
corpus_root = os.path.abspath(sys.argv[1])
training_sample_path = sys.argv[2]

log_path = sys.argv[3]
num_threads = int(sys.argv[4])


with open(training_sample_path, 'r') as f:

    reader = UnicodeReader(f)

    # Remove a stale log from a previous run; a missing file is fine.
    # (The old loop variable rebound `f`, shadowing the open sample-file
    # handle above; `stale` avoids that.)
    flog = 'log_dos2unix_' + os.path.basename(training_sample_path)
    try:
        for stale in [flog]:
            os.remove(os.path.join(log_path, stale))
    except OSError:
        pass

    # Convert the sampled files in parallel; results arrive in arbitrary
    # order.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):

        # Append-mode re-open per result so progress is flushed as we go.
        with open(os.path.join(log_path, flog), 'a') as g:
            writer = UnicodeWriter(g)
            writer.writerow(result)

    # Release the worker processes once all results are consumed.
    pool.close()
    pool.join()
Ejemplo n.º 16
0
test_sample_path = sys.argv[2]

output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

with open(test_sample_path, 'r') as f:

    reader = UnicodeReader(f)

    # Remove a stale log from a previous run; a missing file is fine.
    # (The old loop variable rebound `f`, shadowing the open sample-file
    # handle above; `stale` avoids that.)
    flog = 'log_' + os.path.basename(test_sample_path)
    try:
        for stale in [flog]:
            os.remove(os.path.join(output_path, stale))
    except OSError:
        pass

    # Measure each listed file in parallel; results arrive in arbitrary
    # order.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):

        # Append-mode re-open per result so progress is flushed as we go.
        with open(os.path.join(output_path, flog), 'a') as g:
            writer = UnicodeWriter(g)

            # Success: (js_file_path, n_lines, max_line_len); failure
            # rows are written through unchanged.
            if result[1] is not None:
                (js_file_path, n_lines, max_line_len) = result
                writer.writerow([js_file_path, 'OK', n_lines, max_line_len])
            else:
                writer.writerow(result)

    # Release the worker processes once all results are consumed.
    pool.close()
    pool.join()