def getFilesToLex(top_lvl_dir, extension):
    """Collect every file path under *top_lvl_dir* that matches *extension*.

    Parameters
    ----------
    top_lvl_dir : str
        Head directory of a project or corpus; searched recursively.
        If it is not a directory it is treated as a single input file.
    extension : str
        Wildcard pattern for the file type to collect (e.g. "*.java").

    Returns
    -------
    list of str
        Absolute paths of the matching files (for "*.c", matching "*.h"
        header files are included as well).
    """
    root = os.path.abspath(top_lvl_dir)
    if not os.path.isdir(root):
        # Not a directory: treat the argument as one input file.
        return [root]
    folder = Folder(root)
    matches = folder.fullFileNames(extension, recursive=True)
    if extension == "*.c":
        # C projects keep declarations in headers, so lex those too.
        matches += folder.fullFileNames("*.h", recursive=True)
    return matches
# NOTE(review): fragment of wikiCorpusLexer.py — the argument-parsing code
# above this point (defining inputDir, noStopwords, stopwordsFile) and the
# body of the final elif are outside this view; indentation reconstructed.
print(
    "usage: python wikiCorpusLexer.py inputDir outputDir filterStopwords [stopwordsFiles]"
)
print("Subset is 0 or 1, 0 keeps all words, 1 filters the stop words.")
print("If 1, supply a file containing the stop words.")
quit()

# Optional stop-word list, lower-cased and stripped, one word per line.
stopwords = []
if (noStopwords == 1):
    with open(stopwordsFile, 'r') as f:
        for line in f:
            stopwords.append(line.lower().strip())

basePath = os.path.abspath(inputDir)
corpusFolder = Folder(basePath)

# State for the article-scanning loop below.
inArticle = False
articleText = []
i = 0
for path in corpusFolder.fullFileNames("*.txt", recursive=False):
    #Read in inputFile
    with codecs.open(
            path, 'r', encoding='latin1', errors='ignore'
    ) as f:  #Wikipedia English is UTF-8, so there shouldn't be errors?
        for line in f:
            # <doc ...> / </doc> tags delimit one Wikipedia article.
            if (line.startswith("<doc")
                ):  #Some metadata here that might be useful
                inArticle = True
            elif (line.startswith("</doc")):
# NOTE(review): fragment — root_path, tune_path and n_cores are defined
# outside this view; indentation reconstructed. Generates one tune.sh
# (Moses MERT tuning launcher) per corpus variant and makes it executable.
corpus_files = [os.path.basename(f) \
                for f in glob.glob(os.path.join(root_path, "*.js")) \
                if os.path.basename(f).startswith("corpus.") and \
                os.path.basename(f) != "corpus.orig.js"]
# f[7:-3] strips the "corpus." prefix (7 chars) and ".js" suffix (3 chars).
suffixes = [f[7:-3] for f in corpus_files]

for idx, suffix in enumerate(suffixes):
    variant = "train." + suffix
    # Shell script template; the six %s slots are filled below in order:
    # tuning corpus, reference corpus, thread count, moses.ini, working
    # dir, and the mert log file.
    text = '''#!/bin/bash -l # /home/bogdanv/mosesdecoder/scripts/training/mert-moses.pl %s %s --decoder-flags="-threads %s" --nbest=10 /home/bogdanv/mosesdecoder/bin/moses %s --mertdir /home/bogdanv/mosesdecoder/bin/ --rootdir /home/bogdanv/mosesdecoder/scripts --working-dir %s &> %s & ''' % (os.path.join(tune_path, "corpus." + suffix + ".js"), \
        os.path.join(tune_path, "corpus.orig.js"), \
        str(n_cores), os.path.join(root_path, variant, "model", "moses.bin.ini"), \
        os.path.join(root_path, variant, "tuning"), \
        os.path.join(root_path, variant, "tuning", "mert.out"))
    with open(os.path.join(root_path, variant, "tune.sh"), "w") as f:
        f.write(text)
    # chmod +x the generated script so it can be launched directly.
    st = os.stat(os.path.join(root_path, variant, "tune.sh"))
    os.chmod(os.path.join(root_path, variant, "tune.sh"),
             st.st_mode | stat.S_IEXEC)
    Folder(os.path.join(root_path, variant, "tuning")).create()
import sys
import os
from folderManager import Folder

#This script will produce results similar to wc -w *
#and is usefull when * expands past what the default wc
#can handle

try:
    inputDir = sys.argv[1]
    fileExtension = sys.argv[2]
except IndexError:
    # Bug fix: originally there was no quit() here, so a missing argument
    # printed usage and then crashed on the undefined names below. The
    # usage line also omitted the required fileExtension argument.
    print("usage: python super_wc.py inputDir fileExtension")
    quit()

codeFolder = Folder(os.path.abspath(inputDir))

# Per-file and running word counts ("total" avoids shadowing builtin sum).
total = 0
for path in codeFolder.fullFileNames(fileExtension, recursive=False):
    with open(path, 'r') as f:  # with-block closes the handle (was leaked)
        count = len(f.read().split())
    total += count
    print(str(count) + " : " + path)

print("Total: " + str(total))
import os
from folderManager import Folder

# Rewrite every *.tokens file under the sample directory in place,
# replacing the literal "<UNK>" marker with "UNK" (language-model tools
# treat <UNK> specially, so it must be hidden in the token stream).
inputFolder = Folder("/Users/caseycas/CodeNLP/EnglishSample/all/")
fileList = inputFolder.fullFileNames("*.tokens", recursive=True)
for path in fileList:
    with open(path, 'r') as f:  # with-block closes the read handle
        fileContents = f.read()
    fileContents = fileContents.replace("<UNK>", "UNK")
    with open(path, 'w') as f:
        # Bug fix: original wrote undefined name "fileContents4" (NameError).
        f.write(fileContents)
#File: (Full Path) #<import statement 1> #... #File: (Full Path) #<import statment 2> if len(sys.argv) < 2: print('Usage: python dumpImports.py input_dir ext output_file') print( "Example: python dumpImports.py ~/CodeNLP/HaskellProjects/ *.hs haskellImports.txt" ) quit() print(sys.argv) codeFolder = Folder(os.path.abspath(sys.argv[1])) # File type to be considered fileExtension = sys.argv[2] output_file = sys.argv[3] internalCount = 0 externalCount = 0 with open(output_file, 'w') as out: for path in codeFolder.fullFileNames(fileExtension, recursive=True): out.write("File: " + path + "\n") try: with open(path, 'r') as f: for line in f: line = line.replace("\n", "") if (line.strip().startswith("import ") or line.strip().startswith("open ")):
no_renaming, basic_renaming, normalized, # hash_renaming, hash_def_one_renaming, hash_def_two_renaming) except Exception, e: cleanup(temp_files) return (js_file_path, None, str(e)) corpus_root = os.path.abspath(sys.argv[1]) training_sample_path = os.path.abspath(sys.argv[2]) output_path = Folder(sys.argv[3]).create() num_threads = int(sys.argv[4]) Folder(os.path.join(output_path, 'orig')).create() Folder(os.path.join(output_path, 'no_renaming')).create() Folder(os.path.join(output_path, 'basic_renaming')).create() Folder(os.path.join(output_path, 'normalized')).create() # Folder(os.path.join(output_path, 'hash_renaming')).create() Folder(os.path.join(output_path, 'hash_def_one_renaming')).create() Folder(os.path.join(output_path, 'hash_def_two_renaming')).create() # seen = set(Folder(os.path.join(output_path, 'orig')).baseFileNames('*.js')).\ # intersection(Folder(os.path.join(output_path, 'no_renaming')).baseFileNames('*.js')).\ # intersection(Folder(os.path.join(output_path, 'basic_renaming')).baseFileNames('*.js')).\ # intersection(Folder(os.path.join(output_path, 'normalized')).baseFileNames('*.js')).\
        # NOTE(review): fragment — this "if ok:" sits inside a try block
        # (and processFile definition) whose header is outside this view.
        if ok:
            # Beautified copy exists; compare it against the original to
            # decide whether the file was minified.
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                isMini = str(e)
            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]
        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']
    except TimeExceededError:
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']

# Fan the corpus out over 8 worker processes and stream one CSV row
# (file name, minified verdict) per file.
corpus_dir = Folder(sys.argv[1])
pool = multiprocessing.Pool(processes=8)
with open('isMinified.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    for line in pool.imap(processFile, corpus_dir.fullFileNames("*.js")):
        writer.writerow(line)
import sys
import os
from folderManager import Folder
from unicodeManager import UnicodeWriter
from utilities import *
from fileUtilities import *
from pygments.lexers import get_lexer_for_filename
from pygments import lex
import dictUtils
import pickle

# NOTE(review): fragment of apiFinder.py — the body of the final for loop
# is outside this view.
try:
    # Path to root folder containing the source code
    basePath = os.path.abspath(sys.argv[1])
    codeFolder = Folder(basePath)
    #print(codeFolder)
    # File type to be considered
    fileExtension = sys.argv[2]
except:
    print("usage: python apiFinder.py input_dir file_ext")
    print("e.g. python apiFinder.py ~/CodeNLP/HaskellProjects/ *.hs")
    quit()

#Project -> File (Or Class?) -> functionName
corpusDefinitions = {}

for path in codeFolder.fullFileNames(fileExtension, recursive=False):
# Replace literal "<unk>" with "UNK" in the lstm train/valid/test files,
# since kenlm treats an explicit <unk> token specially.
parser = argparse.ArgumentParser(
    description="Kenlm doesn't handle existing <unk> so let's hide them as UNK"
)
parser.add_argument(
    "input_dir",
    # Bug fix: the two concatenated help strings had no separating space
    # ("only" + "with" rendered as "onlywith").
    help="Directory to look at. Since the explicity <unk> is used only " +
    "with the lstm sets, we we replace unks in train, valid, and test files only.",
    action="store",
    type=str)

# Only the LSTM split files are rewritten.
lstm_files = ["train", "valid", "test"]

args = parser.parse_args()
baseDir = Folder(args.input_dir)  # kept for parity with sibling scripts

fileList = [os.path.join(args.input_dir, f) for f in lstm_files]
print(fileList)

for path in fileList:
    with open(path, 'r') as f:  # with-block closes the read handle (was leaked)
        fileContents = f.read()
    fileContents = fileContents.replace("<unk>", "UNK")
    with open(path, 'w') as f:
        f.write(fileContents)
    # NOTE(review): fragment (Python 2) — this return belongs to a median
    # helper whose def line is outside this view.
    return numpy.median(numpy.array(lst))


def read_in_chunks(file_object, chunk_size=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data


lexer = get_lexer_for_filename("jsFile.js")
f = open('really_big_file.dat')  # NOTE(review): unused/leaked handle; f is rebound below

# Scan selected corpus variants and accumulate the set of Name tokens.
corpora = Folder(sys.argv[1]).fullFileNames("*.js")
for path_corpus in [c for c in corpora
                    if 'orig' in c or 'no_renaming' in c
                    or 'hash_def_one_renaming' in c]:
    print os.path.basename(path_corpus)
    f = open(path_corpus)
    names = set([])
    for piece in read_in_chunks(f):
        #process_data(piece)
        # NOTE(review): pygments.lex normally returns a generator;
        # .tokenList suggests a project-patched lexer — confirm.
        tokens = lex(piece, lexer).tokenList
        names.update([token for (token_type, token) in tokens
                      if is_token_subtype(token_type, Token.Name)])
def check(f, keep_mini):
    # Run MiniChecker on file f; return its verdict, or the exception
    # object itself on failure (caller just prints either).
    mc = MiniChecker(f)
    try:
        return mc.compare(keep_mini=keep_mini)
    except Exception as e:
        return e


# Ad-hoc smoke tests against known files (Python 2 print statements).
print 'is_minified(../test_file1.js):', check('../test_file1.js', False)
print 'is_minified(../node_scoper/test_input.js):', check(
    '../node_scoper/test_input.js', False)
print 'is_minified(../test_file2.js):', check('../test_file2.js', False)
print

from folderManager import Folder

# Sample files are named "<n>.js"; sort numerically by that prefix.
for js_file in sorted(
        Folder('../data/js_files.sample').fullFileNames("*.js"),
        key=lambda f: int(os.path.basename(f).split('.')[0])):
    # Preprocess and beautify into temp files, then check the result;
    # temp files are removed as soon as each step is done with them.
    prepro = Preprocessor(js_file)
    prepro.write_temp_file('tmp.js')
    beauty = Beautifier()
    ok = beauty.run('tmp.js', 'tmp.b.js')
    os.remove('tmp.js')
    if ok:
        print 'is_minified(%s):' % os.path.basename(js_file), check(
            'tmp.b.js', False)
        os.remove('tmp.b.js')
import os
from folderManager import Folder

# Strip the <s>/</s> sentence-boundary tags from every POS-tagged TECCL
# corpus file, rewriting each file in place.
codeFolder = Folder(
    "/Users/caseycas/CodeNLP/EFLCorpus/TECCL_Corpus_V1.1/02TECCL_V1.1_POS/")
fileList = codeFolder.fullFileNames("*.txt")
print(len(fileList))

for path in fileList:
    print(path)
    # Read and clean first, then reopen for writing so the rewrite is
    # done on the fully buffered content.
    with open(path, 'r') as f:
        cleaned = [ln.replace("<s>", "").replace("</s>", "") for ln in f]
    with open(path, 'w') as f:
        f.writelines(cleaned)
import sys import os sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) import glob from folderManager import Folder root_path = os.path.abspath(sys.argv[1]) target_path = os.path.abspath(sys.argv[2]) for corpus in Folder(root_path).subfoldersNoHidden(): print os.path.join(target_path, os.path.basename(corpus)) o = Folder(os.path.join(target_path, os.path.basename(corpus))).create() corpus_files = [f \ for f in glob.glob(os.path.join(os.path.join(root_path, corpus), "*.js")) \ if os.path.basename(f).startswith("corpus.")] for f in corpus_files: with open(os.path.join(target_path, os.path.basename(corpus), os.path.basename(f)), "w") as g: for row in open(f).readlines(): if len(row.split()) <= 20: g.write(row)
    # NOTE(review): fragment — this is the tail of a usage/help block whose
    # opening print( and surrounding argument-count check are outside this
    # view. Indentation reconstructed.
    "Collapsed replaces all the name types (plus Keyword.Type) with their label."
)
print("Collapsed option also forces the string option to 1.")
print("And finally two y/n options on whether to keep files with over " +
      str(MAX_SIZE))
print(
    "tokens and if we want to explicitly have the type <Token|Type> in the output files."
)
quit()

print(sys.argv)
# Path to root folder containing the source code
basePath = os.path.abspath(sys.argv[1])
codeFolder = Folder(basePath)
#print(codeFolder)
# File type to be considered
fileExtension = sys.argv[2]
# Path to output file with tokenized code
outputDir = sys.argv[3]
#outputFile = open(os.path.abspath(sys.argv[3]), 'wb')
#writer = UnicodeWriter(outputFile)
#String flag
strFlag = int(sys.argv[4])
# Token/type separator or split mode — used further down, outside this view.
token_split = sys.argv[5]
mosesStatus = checkMosesServers( moses_url_dict) #Eventually turn into list of failed servers #Do a simple kill and restart for the moment (can change to something more selective later). if (args.debug): print(mosesStatus) for port, status in mosesStatus.iteritems(): if (status == "E" or status == "F"): mosesFail = True if (not mosesFail): #Stop checking once the servers are online. break print("Servers are online.") if (args.batch): inputFolder = Folder(os.path.abspath(args.input)) fileList = inputFolder.fullFileNames("*.js", recursive=False) for next_file in fileList: print("Renaming " + str(next_file)) base_file = ntpath.basename(next_file) output_file = \ os.path.join(args.output, base_file[:base_file.rfind(".")] + ".out.js") try: processFile(next_file, output_file, args) except: print("Renaming of " + str(next_file) + " failed.") else: print("Renaming " + str(args.input)) processFile(args.input, args.output, args)
# NOTE(review): fragment — args comes from an argparse block outside this
# view, and the code using rankDir/freqDir continues past it.
inputDir = args.inputDir
fileType = args.fileType
outputFile = args.outputFile
ngramOrder = args.ngramOrder
testLocation = args.testLocation
projectMap = args.projectMap
trackTypes = args.trackTypes
independentSplit = args.independentSplit

assert (ngramOrder > 0)

if (args.independentSplit):
    #Divide the corpus into two.
    #1) Get set of files in the base directory.
    basePath = os.path.abspath(inputDir)
    codeFolder = Folder(basePath)
    #These variant requires regexes of the form *.<ext> not .*.<ext>
    # fileType[1:] drops the leading "." from the pattern.
    fileList = codeFolder.fullFileNames(fileType[1:], recursive=False)
    #2) Divide into 2 randomly.
    splitFiles = listUtils.partitionList(fileList)
    #3) Save each in a temp directory. (cd ../input_dir mkdir [rank|freq]_input_dir) (use ln -s to copy)
    (parentDir, localDir) = os.path.split(basePath)
    # Sibling directories for the two halves of the split corpus.
    rankDir = os.path.join(parentDir, "rank_" + localDir)
    freqDir = os.path.join(parentDir, "freq_" + localDir)
    #print(splitFiles[0])
    #print(splitFiles[1])
    #print(len(splitFiles[0]))
    #print(len(splitFiles[1]))
    #print(rankDir)
    #print(freqDir)
    #quit()
'''
Select a random sample from a corpus folder.
'''
import os
import random
from shutil import copyfile
import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from folderManager import Folder

# CLI: corpus folder, number of files to sample, destination folder.
corpus_dir = Folder(sys.argv[1])
sample_size = int(sys.argv[2])
out_dir = Folder(sys.argv[3]).create()

# Draw the sample without replacement and copy each file, keeping its
# base name, into the output folder.
for sampled in random.sample(corpus_dir.fullFileNames("*.js"), sample_size):
    destination = os.path.join(out_dir, os.path.basename(sampled))
    copyfile(sampled, destination)
        # NOTE(review): fragment (Python 2) — this is the tail of a worker
        # function; its def/try header is outside this view, as is the rest
        # of the __main__ block. Indentation reconstructed.
        with open(os.path.join(output_path, r_strategy, js_file_path), 'w') as f:
            f.write(text)
        return (js_file_path, 'OK', None)
    except Exception, e:
        # Report the failure as a row; strip newlines so it stays one line.
        return (js_file_path, None, str(e).replace("\n", ""))


if __name__ == "__main__":
    # CLI: corpus root, sample list, output dir, thread count.
    corpus_root = os.path.abspath(sys.argv[1])
    sample_path = os.path.abspath(sys.argv[2])
    output_path = Folder(sys.argv[3]).create()
    num_threads = int(sys.argv[4])

    flog = 'log_renameAndUglifyMem_' + os.path.basename(corpus_root)

    RS = RenamingStrategies()

    # One output subfolder per renaming strategy, plus the originals.
    Folder(os.path.join(output_path, 'orig')).create()
    for r_strategy in RS.all():
        Folder(os.path.join(output_path, r_strategy)).create()

    with open(sample_path, 'r') as f:
        reader = UnicodeReader(f)
        pool = multiprocessing.Pool(processes=num_threads)
import sys
import os
from folderManager import Folder
from unicodeManager import UnicodeWriter
from utilities import *
from pygments.lexers import get_lexer_for_filename
from pygments import lex

# NOTE(review): fragment of lex.py (Python 2) — the except clause of the
# final try and the rest of the loop body are outside this view.
if len(sys.argv) < 4:
    print 'Usage: python lex.py path_to_code_folder file_name_extension output_file'
    exit()
print sys.argv

# Path to root folder containing the source code
codeFolder = Folder(os.path.abspath(sys.argv[1]))
# File type to be considered
fileExtension = sys.argv[2]
# Path to output file with tokenized code
outputFile = open(os.path.abspath(sys.argv[3]), 'wb')
writer = UnicodeWriter(outputFile)

for path in codeFolder.fullFileNames(fileExtension, recursive=True):
    try:
        fileContents = ''.join(open(path, 'r').readlines())
        # Pick the pygments lexer from the file name, then tokenize.
        lexer = get_lexer_for_filename(path)
        tokens = lex(fileContents, lexer)  # returns a generator of tuples
        tokensList = list(tokens)
        language = languageForLexer(lexer)
                "\n")
        # NOTE(review): fragment (Python 2) — tail of a worker function
        # whose header is outside this view. Indentation reconstructed.
        return (
            js_file_path,
            orig,
            no_renaming,
            # basic_renaming,
            # normalized,
            hash_def_one_renaming,
            hash_def_two_renaming)
    except Exception, e:
        return (js_file_path, None, str(e))


# CLI: corpus root, output dir, sample size, thread count.
files_root = os.path.abspath(sys.argv[1])
output_path = Folder(sys.argv[2]).create()
sample_size = int(sys.argv[3])
num_threads = int(sys.argv[4])

flog = 'log_renameAndUglify'

# Collect file names already marked 'OK' in a previous run's log so they
# can be skipped; a missing/corrupt log just yields an empty set.
in_log = set([])
reader = UnicodeReader(open(os.path.join(files_root, flog), 'r'))
try:
    for row in reader:
        if row[1] == 'OK':
            in_log.add(row[0])
except:
    pass
print len(in_log), 'in log'
'''
Split JS files into training, tuning, and testing
'''
import os
import random
import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from folderManager import Folder
from unicodeManager import UnicodeReader, UnicodeWriter

# NOTE(review): fragment — the "eligible" list comprehension is cut off at
# the end of this view (closing bracket outside it).
corpus_dir = Folder(sys.argv[1])


def str_to_bool(s):
    # CSV cells are strings; only the literal 'True' maps to True.
    if s == 'True':
        return True
    return False


# Map file base name -> minified? as recorded by the isMinified pass.
isMini = {}
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    isMini[row[0]] = str_to_bool(row[1])

# Only non-minified files are eligible for the split.
eligible = [
    os.path.basename(f) for f in corpus_dir.fullFileNames("*.js")
    if not isMini.get(os.path.basename(f), False)
            # NOTE(review): fragment (Python 2) — tail of a worker function
            # whose header is outside this view. Indentation reconstructed.
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        cleanup(temp_files)
        return (js_file_path, True, 'OK')
    except Exception, e:
        cleanup(temp_files)
        return (js_file_path, False, str(e))


# CLI: corpus root, training sample list, output dir, thread count.
corpus_root = os.path.abspath(sys.argv[1])
training_sample_path = os.path.abspath(sys.argv[2])
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

flog = 'log_' + os.path.basename(training_sample_path)

# Remove any stale log from a previous run; ignore if absent.
try:
    for f in [flog]:
        os.remove(os.path.join(output_path, f))
except:
    pass

with open(training_sample_path, 'r') as f, \
        open(os.path.join(output_path, flog), 'w') as g:
    reader = UnicodeReader(f)
import os
from folderManager import Folder
import codecs
import sys
import utilities

# For every matching token file, strip the POS/type labels from each word
# and write the cleaned words, space-joined, to outputDir under the same
# file name.
try:
    inputDir = sys.argv[1]
    fileType = sys.argv[2]
    outputDir = sys.argv[3]
except IndexError:  # narrowed from bare except
    print("usage python Freq.py inputDir fileTypeWildCard outputDir")
    print("e.g. python Freq.py /Users/caseycas/CodeNLP/HaskellCorpus/files/ *.hs.tokens /Users/caseycas/CodeNLP/HaskellCorpus/files/ 2")
    quit()

# Path to root folder containing the source code
# (consistency fix: use inputDir rather than re-reading sys.argv[1]).
codeFolder = Folder(os.path.abspath(inputDir))
for path in codeFolder.fullFileNames(fileType, recursive=False):
    # Portability fix: os.path.basename instead of path.split("/")[-1].
    filename = os.path.basename(path)
    print(filename)
    words = []
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            words += line.split()
    words = [utilities.removeLabel(w) for w in words]
    # Bug fix: "outputDir + filename" produced a wrong path whenever
    # outputDir lacked a trailing slash; os.path.join is always correct.
    with codecs.open(os.path.join(outputDir, filename), 'w', 'utf-8') as f2:
        f2.write(" ".join(words))