def writeLexedFile(outputDir, fileExtension, lexedWoComments, fileid, flag, explicitWrite):
    """Write one file's lexed token stream as a single space-separated line.

    Output goes to <outputDir>/<fileid>.<ext>.tokens (ext is fileExtension with
    its leading two characters, e.g. "*.", stripped).  Tokens have newlines
    removed and internal spaces replaced by underscores; empty tokens are
    skipped.  For the "labelled"/"android"/"api" flags with explicitWrite True,
    each token is wrapped as <token|TokenType>.

    outputDir      -- destination directory
    fileExtension  -- glob-style extension, e.g. "*.java" (first 2 chars dropped)
    lexedWoComments -- iterable of (token_type, token_text) pairs
    fileid         -- numeric/string id used as the output file name stem
    flag           -- processing mode selector
    explicitWrite  -- when True, emit <token|type> annotated tokens
    """
    assert (len(lexedWoComments) <= MAX_SIZE)
    # Optionally the raw source file could be copied alongside the .tokens
    # output; that is intentionally omitted here to save space.
    # Output format must be a single line; file name is <fileid>.<ext>.tokens.
    with open(outputDir + "/" + str(fileid) + "." + fileExtension[2:] + ".tokens", "wb") as outputFile:
        for t in lexedWoComments:
            token = t[1]
            noWS = token.strip()
            noWS = noWS.replace('\n', '')  # Remove new lines
            noWS = noWS.replace(' ', '_')  # Replace any remaining internal spaces
            if (noWS == ""):
                continue
            if (flag == "labelled" or flag == "android" or flag == "api"):
                # Keep the `== True` test: callers may pass non-boolean values
                # and only an explicit True should trigger annotation.
                if (explicitWrite == True):
                    noWS = "<" + noWS + "|" + str(t[0]) + ">"
            # The file is opened in binary mode, so every write must be bytes:
            # the token is UTF-8 encoded and the separators use bytes literals
            # (previously plain str, which only worked on Python 2).
            outputFile.write(noWS.encode("utf-8"))
            outputFile.write(b' ')
        # One trailing newline per file; without it the SRILM ngram tools can
        # have trouble separating consecutive files.
        outputFile.write(b'\n')
# NOTE(review): fragment — the loop that produces `result` (and the
# definitions of orig/results_path/n2s/strategies) begins before this chunk;
# indentation reconstructed from the whitespace-mangled source.
    if result[1] is not None:
        # Successful result: record each (def_scope -> (name, global-flag))
        # candidate under its file name.
        file_name, ok, candidates = result
        orig.setdefault(file_name, {})
        for (def_scope, name, glb) in candidates:
            orig[file_name][def_scope] = (name, glb)
            # print file_name, (name, glb)
            # orig[file_name].setdefault(def_scope, [])
            # orig[file_name][def_scope].append(name)
    else:
        print result[0], result[2]

# One stats row per file: name counts plus four column groups per strategy.
writer = UnicodeWriter(open(os.path.join(results_path, 'stats.csv'), 'w'))
writer.writerow(
    ['file', 'num_names', 'num_glb_names', 'num_loc_names'] +
    [n2s[i].replace('.', '_') for i in range(len(strategies))] +
    [n2s[i].replace('.', '_') + '_all' for i in range(len(strategies))] +
    [n2s[i].replace('.', '_') + '_glb' for i in range(len(strategies))] +
    [n2s[i].replace('.', '_') + '_maybe' for i in range(len(strategies))])

for file_name in orig.iterkeys():
    row = [file_name]
    # Per-strategy counters, reset for each file.
    counts_loc = [0] * len(strategies)
    counts_glb = [0] * len(strategies)
    counts = [0] * len(strategies)
    alt_counts = [0] * len(strategies)
    num_names = 0
# Driver: run processFile over every path listed in the training sample and
# log one row per input into log_<sample-name> under the output directory.
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

flog = 'log_' + os.path.basename(training_sample_path)

# Best-effort removal of a stale log from a previous run.  Narrowed from a
# bare `except:` — only OS-level failures (e.g. file does not exist) are
# expected and safe to ignore here.
try:
    for f in [flog]:
        os.remove(os.path.join(output_path, f))
except OSError:
    pass

with open(training_sample_path, 'r') as f, \
        open(os.path.join(output_path, flog), 'w') as g:
    reader = UnicodeReader(f)
    writer = UnicodeWriter(g)
    pool = multiprocessing.Pool(processes=num_threads)
    # imap_unordered: log rows in completion order, not input order.
    for result in pool.imap_unordered(processFile, reader):
        if result[1]:
            # Success: (path, ok, message)
            (js_file_path, ok, msg) = result
            writer.writerow([js_file_path, msg])
        else:
            # Failure: result[0] is the path, result[2] the error message.
            writer.writerow([result[0], result[2]])
# Best-effort cleanup of outputs from a previous run.
# NOTE(review): the bare except silently swallows *all* errors here, not just
# missing files — consider narrowing to OSError.
try:
    for f in [f1, f2, f5, f6, glog]:
        os.remove(os.path.join(output_path, f))
except:
    pass

# One output stream per renaming strategy (two strategies are currently
# disabled — see the commented-out handles below).
with open(os.path.join(output_path, glog), 'w') as g, \
        open(os.path.join(output_path, f1), 'w') as f_orig, \
        open(os.path.join(output_path, f2), 'w') as f_no_renaming, \
        open(os.path.join(output_path, f5), 'w') as f_hash_def_one_renaming, \
        open(os.path.join(output_path, f6), 'w') as f_hash_def_two_renaming:
    # open(os.path.join(output_path, f3), 'w') as f_basic_renaming, \
    # open(os.path.join(output_path, f4), 'w') as f_normalized, \

    writer = UnicodeWriter(g)
    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, corpus_sample):
        if result[1] is not None:
            # Unpack one token stream per (enabled) renaming strategy.
            # NOTE(review): fragment — the loop body continues past this chunk.
            (
                js_file_path,
                orig,
                no_renaming,
                # basic_renaming,
                # normalized,
                hash_def_one_renaming,
                hash_def_two_renaming) = result
# NOTE(review): fragment — the matching `try:` and the enclosing function
# (which binds ok/pid/js_file_path) begin before this chunk; indentation
# reconstructed from the whitespace-mangled source.
        if ok:
            # Beautifier succeeded: compare against the minified form.
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                # Record the failure reason in the result instead of crashing.
                isMini = str(e)
            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]
        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']
    except TimeExceededError:
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']

corpus_dir = Folder(sys.argv[1])

pool = multiprocessing.Pool(processes=8)

# One CSV row per *.js file: (basename, minified-verdict).
with open('isMinified.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    for line in pool.imap(processFile, corpus_dir.fullFileNames("*.js")):
        writer.writerow(line)
RS = RenamingStrategies()
proxies = MosesProxy().getProxies()

with open(testing_sample_path, 'r') as f:
    reader = UnicodeReader(f)
    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, reader):
        # Re-open the three output files in append mode for every result so
        # partial progress survives a crash or interruption.
        with open(os.path.join(output_path, flog), 'a') as g, \
                open(os.path.join(output_path, c_path), 'a') as c, \
                open(os.path.join(output_path, s_path), 'a') as sM:
            writer = UnicodeWriter(g)
            cw = UnicodeWriter(c)
            sM_writer = UnicodeWriter(sM)
            if result[1] is not None:
                js_file_path, ok, candidates, model_rows = result
                writer.writerow([js_file_path, ok])
                # Strip embedded double quotes so the CSV stays well-formed.
                for r in candidates:
                    cw.writerow([js_file_path] +
                                [str(x).replace("\"", "") for x in r])
                for row in model_rows:
                    sM_writer.writerow([js_file_path] + row)
            # NOTE(review): fragment — the else-branch body continues past
            # this chunk.
            else:
# Driver: run processFile over every path in the sample file and append one
# log row per input to log_js_functions_<sample-name>.
corpus_root = os.path.abspath(sys.argv[2])
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

with open(sample_path, 'r') as f:
    reader = UnicodeReader(f)

    flog = 'log_js_functions_' + os.path.basename(sample_path)

    # Best-effort removal of a stale log from a previous run.  The loop
    # variable is deliberately NOT named `f` (the original shadowed the open
    # file handle above), and the bare except was narrowed to OSError so only
    # OS-level failures (e.g. missing file) are swallowed.
    try:
        for stale in [flog]:
            os.remove(os.path.join(output_path, stale))
    except OSError:
        pass

    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, reader):
        # Append per result so partial progress survives a crash.
        with open(os.path.join(output_path, flog), 'a') as g:
            writer = UnicodeWriter(g)
            if result[1] is not None:
                writer.writerow(result)
            else:
                # Failure: result[0] is the path, result[2] the error message.
                writer.writerow([result[0], result[2]])
# Labels naming the heuristic rule that matched an alias to an identity.
PREFIX_NAME = 'PREFIX_NAME'
LOGIN_NAME = 'LOGIN_NAME'
FULL_NAME = 'FULL_NAME'
SIMPLE_NAME = 'SIMPLE_NAME'
LOCATION = 'LOCATION'
DOMAIN = 'EMAIL_DOMAIN'
TWO = 'TWO_OR_MORE_RULES'

# Matching thresholds; exact semantics are defined by code outside this
# chunk — TODO confirm against the matching logic.
THR_MIN = 1
THR_MAX = 10

unmask = {}

dataPath = os.path.abspath('../data')

# Three output streams: a processing log, the final identity map, and
# uncertain ("maybe") matches.
w_log = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_maybe.csv'), 'wb'))

# Progress counters: report every `step` input rows.
idx = 0
step = 100000
curidx = step

aliases = {}

# reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
reader = UnicodeReader(
    open(os.path.join(dataPath, 'active_prolific_users.csv'), 'rb'))
# Skip the CSV header row (Python 2 iterator protocol).
_header = reader.next()
# Labels naming the heuristic rule that matched an alias to an identity.
PREFIX_NAME = 'PREFIX_NAME'
LOGIN_NAME = 'LOGIN_NAME'
FULL_NAME = 'FULL_NAME'
SIMPLE_NAME = 'SIMPLE_NAME'
LOCATION = 'LOCATION'
DOMAIN = 'EMAIL_DOMAIN'
TWO = 'TWO_OR_MORE_RULES'

# Matching thresholds; exact semantics are defined by code outside this
# chunk — TODO confirm against the matching logic.
THR_MIN = 1
THR_MAX = 10

unmask = {}

# This variant targets the 2014-01 data snapshot.
dataPath = os.path.abspath('../../data/2014-01')

# Three output streams: a processing log, the final identity map, and
# uncertain ("maybe") matches.
w_log = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(open(os.path.join(dataPath, 'idm', 'idm_maybe.csv'), 'wb'))

# Progress counters: report every `step` input rows.
idx = 0
step = 100000
curidx = step

aliases = {}

# reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
reader = UnicodeReader(open(os.path.join(dataPath, 'clean', 'users_clean_emails.csv'), 'rb'))
# Skip the CSV header row (Python 2 iterator protocol).
_header = reader.next()

# Helper structures
d_email_uid = {}
]  # NOTE(review): closes a list literal (`eligible = [...`) begun before this chunk.

# 80/20 split into (train+tune) vs. test; training takes 90% of the
# train+tune portion and tuning the remaining 10%.
size = len(eligible)
tt = int(0.8 * size)
training_size = int(0.9 * tt)
tuning_size = int(tt - training_size)
testing_size = size - tt

print 'Total:', size
print 'Training:', training_size
print 'Tuning:', tuning_size
print 'Testing:', testing_size

training_sample = random.sample(eligible, training_size)
with open('trainingSample.csv', 'wb') as of:
    writer = UnicodeWriter(of)
    for f in training_sample:
        writer.writerow([f])

# Tuning files are drawn from whatever the training sample did not take.
tuning_sample = random.sample(
    set(eligible).difference(set(training_sample)), tuning_size)
with open('tuningSample.csv', 'wb') as of:
    writer = UnicodeWriter(of)
    for f in tuning_sample:
        writer.writerow([f])

# Testing files: everything not in training or tuning.
# NOTE(review): fragment — the write loop for this sample continues past
# this chunk.
testing_sample = random.sample(
    set(eligible).difference(set(training_sample)).difference(
        set(tuning_sample)), testing_size)
with open('testingSample.csv', 'wb') as of:
    writer = UnicodeWriter(of)
# NOTE(review): fragment — the enclosing try/for (best-effort stale-output
# cleanup) begins before this chunk; indentation reconstructed.
        os.remove(os.path.join(output_path, f))
except:
    pass

# inputs = Folder(corpus_root).fullFileNames("*.js")

with open(testing_sample_path, 'r') as f:
    reader = UnicodeReader(f)
    # result = processFile(reader.next())
    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, reader):
        # if True:
        # Re-open the outputs in append mode per result so partial progress
        # survives a crash.
        with open(os.path.join(output_path, flog), 'a') as g, \
                open(os.path.join(output_path, c_path), 'a') as c:
            writer = UnicodeWriter(g)
            cw = UnicodeWriter(c)
            if result[1] is not None:
                js_file_path, ok, candidates = result
                writer.writerow([js_file_path, ok])
                # Strip embedded double quotes so the CSV stays well-formed.
                for r in candidates:
                    cw.writerow([js_file_path] +
                                [str(x).replace("\"", "") for x in list(r)])
            else:
                writer.writerow([result[0], result[2]])
# Driver: run the sanity-check processFile over every path in the file list
# and append one row per input to log_sanity under results_root.
num_threads = int(sys.argv[3])

flog = 'log_sanity'

# Best-effort removal of a stale log from a previous run.  Narrowed from a
# bare `except:` — only OS-level failures (e.g. the file does not exist) are
# expected and safe to ignore here.
try:
    for stale in [flog]:
        os.remove(os.path.join(results_root, stale))
except OSError:
    pass

with open(file_list_path, 'r') as f:
    reader = UnicodeReader(f)
    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, reader):
        # Append per result so partial progress survives a crash.
        with open(os.path.join(results_root, flog), 'a') as g:
            writer = UnicodeWriter(g)
            if result[1] is not None:
                # Success: (path, ok, method-used)
                js_file_path, ok, method = result
                writer.writerow([js_file_path, ok, method])
            else:
                # Failure: path, error message, empty method column.
                writer.writerow([result[0], result[2], ''])
# Driver: process every original (single-dot) *.js file in input_dir and
# append one CSV row per result to output_csv.
input_dir = sys.argv[1]
output_csv = sys.argv[2]
num_threads = int(sys.argv[3])

base_dir = Folder(input_dir)
fileList = base_dir.baseFileNames("*.js")
# Keep only "name.js" style files (exactly one dot); derived files carry
# extra dots.  The loop variable was renamed from `next`, which shadowed the
# builtin — and in Python 2 list-comprehension variables leak into module
# scope, so the shadowing outlived the comprehension.
origList = [base_name for base_name in fileList if base_name.count(".") == 1]
toProcess = [(nextFile, os.path.join(base_dir.path, nextFile))
             for nextFile in origList]

# Truncate the output CSV before appending results below.
with open(output_csv, 'w') as g:
    pass

pool = multiprocessing.Pool(processes=num_threads)
for result in pool.imap_unordered(processFile, toProcess):
    print(result[0])
    # Re-open in append mode per result so partial progress survives a crash.
    with open(output_csv, 'a') as g:
        writer = UnicodeWriter(g)
        if result[1] is not None:
            writer.writerow(result)
        else:
            writer.writerow([result[0], "error"])
# Make the parent directory importable so the shared helpers resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from unicodeManager import UnicodeReader, UnicodeWriter
from evalRenamingHelper import *

# Command-line parsing; any failure (missing argv entries) prints usage and
# exits.  NOTE(review): the bare except also hides unrelated errors.
try:
    csv_path = os.path.abspath(sys.argv[1])
    output_path = os.path.abspath(sys.argv[2])
except:
    print("usage: python evalRenamings.py csv_in csv_out")
    quit()

reader = UnicodeReader(open(csv_path))
writer = UnicodeWriter(open(output_path, 'w'))

# Monotonically increasing id assigned per unique (file, line, position) key.
nextID = 0
IDMap = {}

for row in reader:
    #Also, let's create an id based on file, name's line num and location (definition)
    # NOTE(review): `file` shadows the Python 2 builtin of the same name.
    file = row[0]
    orig = row[1]
    line_num = row[4]
    token_pos = row[5]
    suggestion = row[6]
    js_nice_name = row[7]
    # Reuse the id if this definition site was seen before.
    # NOTE(review): fragment — the loop body continues past this chunk.
    IDKey = (file, line_num, token_pos)
    if(IDKey in IDMap):
        rowID = IDMap[IDKey]
# Driver: run the dos2unix processFile over every path in the training sample
# and append one log row per input to log_dos2unix_<sample-name>.
corpus_root = os.path.abspath(sys.argv[1])
training_sample_path = sys.argv[2]
log_path = sys.argv[3]
num_threads = int(sys.argv[4])

with open(training_sample_path, 'r') as f:
    reader = UnicodeReader(f)

    flog = 'log_dos2unix_' + os.path.basename(training_sample_path)

    # Best-effort removal of a stale log from a previous run.  The loop
    # variable is deliberately NOT named `f` (the original shadowed the open
    # file handle above), and the bare except was narrowed to OSError so only
    # OS-level failures (e.g. missing file) are swallowed.
    try:
        for stale in [flog]:
            os.remove(os.path.join(log_path, stale))
    except OSError:
        pass

    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, reader):
        # Append per result so partial progress survives a crash.
        with open(os.path.join(log_path, flog), 'a') as g:
            writer = UnicodeWriter(g)
            writer.writerow(result)
# Driver: run processFile (line statistics) over every path in the test
# sample and append one row per input to log_<sample-name>.
test_sample_path = sys.argv[2]
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

with open(test_sample_path, 'r') as f:
    reader = UnicodeReader(f)

    flog = 'log_' + os.path.basename(test_sample_path)

    # Best-effort removal of a stale log from a previous run.  The loop
    # variable is deliberately NOT named `f` (the original shadowed the open
    # file handle above), and the bare except was narrowed to OSError so only
    # OS-level failures (e.g. missing file) are swallowed.
    try:
        for stale in [flog]:
            os.remove(os.path.join(output_path, stale))
    except OSError:
        pass

    pool = multiprocessing.Pool(processes=num_threads)
    for result in pool.imap_unordered(processFile, reader):
        # Append per result so partial progress survives a crash.
        with open(os.path.join(output_path, flog), 'a') as g:
            writer = UnicodeWriter(g)
            if result[1] is not None:
                # Success: (path, line-count, longest-line length)
                (js_file_path, n_lines, max_line_len) = result
                writer.writerow([js_file_path, 'OK', n_lines, max_line_len])
            else:
                writer.writerow(result)