import codecs
import os
import sys

from folderManager import Folder


def getFilesToLex(top_lvl_dir, extension):
    """
    Desc: Get all filepaths recursively from top_lvl_dir that
    match extension

    Parameters:
    ----------
    top_lvl_dir - a directory that is the head of a project or
    corpus.  We will recursively examine all files under it

    extension - A wildcard regex for the type of extension
    to collect (e.g. *.java)

    Returns:
    --------
    list of filepaths matching extension in top_lvl_dir and
    any subdirectory of it.
    """

    # Path to root folder containing the source code

    basePath = os.path.abspath(top_lvl_dir)
    if (os.path.isdir(basePath)):
        inputFolder = Folder(basePath)
        fileList = inputFolder.fullFileNames(extension, recursive=True)
        if (extension == "*.c"):  #For c, check for header files too.
            fileList += inputFolder.fullFileNames("*.h", recursive=True)
    else:  #It's a single file
        fileList = [basePath]

    return fileList
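

#A minimal usage sketch of getFilesToLex (the directory path below is hypothetical):
#
#   javaFiles = getFilesToLex("/path/to/JavaProjects", "*.java")
#   print(str(len(javaFiles)) + " files will be lexed")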

if len(sys.argv) < 4:
    print("usage: python wikiCorpusLexer.py inputDir outputDir filterStopwords [stopwordsFile]")
    print("filterStopwords is 0 or 1; 0 keeps all words, 1 filters the stop words.")
    print("If 1, supply a file containing the stop words.")
    quit()

inputDir = sys.argv[1]
outputDir = sys.argv[2]
noStopwords = int(sys.argv[3])
stopwordsFile = sys.argv[4] if noStopwords == 1 else None

stopwords = []

if (noStopwords == 1):
    with open(stopwordsFile, 'r') as f:
        for line in f:
            stopwords.append(line.lower().strip())

basePath = os.path.abspath(inputDir)
corpusFolder = Folder(basePath)

inArticle = False
articleText = []
i = 0

for path in corpusFolder.fullFileNames("*.txt", recursive=False):
    #Read in inputFile
    #Wikipedia English is UTF-8, so there shouldn't be errors?
    with codecs.open(path, 'r', encoding='latin1', errors='ignore') as f:
        for line in f:
            if (line.startswith("<doc")):  #Some metadata here that might be useful
                inArticle = True
            elif (line.startswith("</doc")):
Example #3
corpus_files = [os.path.basename(f) \
                    for f in glob.glob(os.path.join(root_path, "*.js")) \
                    if os.path.basename(f).startswith("corpus.") and \
                        os.path.basename(f) != "corpus.orig.js"]

suffixes = [f[7:-3] for f in corpus_files]  #strip the "corpus." prefix and ".js" suffix to get each variant name

for idx, suffix in enumerate(suffixes):

    variant = "train." + suffix

    text = '''#!/bin/bash -l
#
/home/bogdanv/mosesdecoder/scripts/training/mert-moses.pl %s %s --decoder-flags="-threads %s" --nbest=10 /home/bogdanv/mosesdecoder/bin/moses %s --mertdir /home/bogdanv/mosesdecoder/bin/ --rootdir /home/bogdanv/mosesdecoder/scripts --working-dir %s &> %s &
''' % (os.path.join(tune_path, "corpus." + suffix + ".js"), \
       os.path.join(tune_path, "corpus.orig.js"), \
       str(n_cores),
       os.path.join(root_path, variant, "model", "moses.bin.ini"), \
       os.path.join(root_path, variant, "tuning"), \
       os.path.join(root_path, variant, "tuning", "mert.out"))

    with open(os.path.join(root_path, variant, "tune.sh"), "w") as f:
        f.write(text)

    st = os.stat(os.path.join(root_path, variant, "tune.sh"))
    os.chmod(os.path.join(root_path, variant, "tune.sh"),
             st.st_mode | stat.S_IEXEC)

    Folder(os.path.join(root_path, variant, "tuning")).create()
Example #4
import sys
import os
from folderManager import Folder

#This script will produce results similar to wc -w *
#and is useful when * expands past what the shell can pass to wc
#(i.e. too many files for a single command line).
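#A roughly equivalent shell pipeline, for comparison only (recursive=False below
#means only the top-level directory is scanned):
#  find <inputDir> -maxdepth 1 -name "<fileExtension>" -print0 | xargs -0 wc -w
#(xargs may split a very long file list, so more than one "total" line can appear)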

try:
    inputDir = sys.argv[1]
    fileExtension = sys.argv[2]
except IndexError:
    print("usage: python super_wc.py inputDir fileExtension")
    quit()

codeFolder = Folder(os.path.abspath(inputDir))

total = 0

for path in codeFolder.fullFileNames(fileExtension, recursive=False):
    with open(path, 'r') as f:
        count = len(f.read().split())
    total += count
    print(str(count) + " : " + path)

print("Total: " + str(total))
Example #5
import os
from folderManager import Folder

inputFolder = Folder("/Users/caseycas/CodeNLP/EnglishSample/all/")
fileList = inputFolder.fullFileNames("*.tokens", recursive=True)

for path in fileList:
    with open(path, 'r') as f:
        fileContents = f.read()
    fileContents = fileContents.replace("<UNK>", "UNK")
    with open(path, 'w') as f:
        f.write(fileContents)
Example #6
#File: (Full Path)
#<import statement 1>
#...
#File: (Full Path)
#<import statement 2>
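#
#For example, a dump might look like this (hypothetical paths and import lines):
#File: /path/to/project/src/Main.hs
#import Data.List
#import qualified Data.Map as Map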

import os
import sys

from folderManager import Folder

if len(sys.argv) < 4:
    print('Usage: python dumpImports.py input_dir ext output_file')
    print(
        "Example: python dumpImports.py ~/CodeNLP/HaskellProjects/ *.hs haskellImports.txt"
    )
    quit()

print(sys.argv)

codeFolder = Folder(os.path.abspath(sys.argv[1]))
# File type to be considered
fileExtension = sys.argv[2]
output_file = sys.argv[3]

internalCount = 0
externalCount = 0
with open(output_file, 'w') as out:
    for path in codeFolder.fullFileNames(fileExtension, recursive=True):
        out.write("File: " + path + "\n")
        try:
            with open(path, 'r') as f:
                for line in f:
                    line = line.replace("\n", "")
                    if (line.strip().startswith("import ")
                            or line.strip().startswith("open ")):
Example #7
                no_renaming,
                basic_renaming,
                normalized, 
#                 hash_renaming,
                hash_def_one_renaming,
                hash_def_two_renaming)
        
    except Exception as e:
        cleanup(temp_files)
        return (js_file_path, None, str(e))
    
    
corpus_root = os.path.abspath(sys.argv[1])
training_sample_path = os.path.abspath(sys.argv[2])

output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

Folder(os.path.join(output_path, 'orig')).create()
Folder(os.path.join(output_path, 'no_renaming')).create()
Folder(os.path.join(output_path, 'basic_renaming')).create()
Folder(os.path.join(output_path, 'normalized')).create()
# Folder(os.path.join(output_path, 'hash_renaming')).create()
Folder(os.path.join(output_path, 'hash_def_one_renaming')).create()
Folder(os.path.join(output_path, 'hash_def_two_renaming')).create()


# seen = set(Folder(os.path.join(output_path, 'orig')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'no_renaming')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'basic_renaming')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'normalized')).baseFileNames('*.js')).\
Example #8
        if ok:
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                isMini = str(e)
                
            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]
        
        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']
        
    except TimeExceededError:
        
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']
        

    
    
corpus_dir = Folder(sys.argv[1])

pool = multiprocessing.Pool(processes=8)
with open('isMinified.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    for line in pool.imap(processFile, corpus_dir.fullFileNames("*.js")):
        writer.writerow(line)

Example #9
import sys
import os
from folderManager import Folder
from unicodeManager import UnicodeWriter
from utilities import *
from fileUtilities import *
from pygments.lexers import get_lexer_for_filename
from pygments import lex
import dictUtils
import pickle

try:
    # Path to root folder containing the source code
    basePath = os.path.abspath(sys.argv[1])
    codeFolder = Folder(basePath)

    #print(codeFolder)

    # File type to be considered
    fileExtension = sys.argv[2]
except:
    print("usage: python apiFinder.py input_dir file_ext")
    print("e.g. python apiFinder.py ~/CodeNLP/HaskellProjects/ *.hs")
    quit()

#Project -> File (Or Class?) -> functionName 
corpusDefinitions = {}
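#A hypothetical illustration of the intended shape (project, file, and function
#names below are made up):
#  corpusDefinitions["someProject"]["src/Parser.hs"] = ["parseExpr", "parseDecl"]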


for path in codeFolder.fullFileNames(fileExtension, recursive=False):
Example #10
import argparse
import os

from folderManager import Folder

parser = argparse.ArgumentParser(
    description="Kenlm doesn't handle existing <unk> so let's hide them as UNK"
)

parser.add_argument(
    "input_dir",
    help="Directory to look at.  Since the explicit <unk> is used only " +
    "with the lstm sets, we replace unks in the train, valid, and test files only.",
    action="store",
    type=str)

lstm_files = ["train", "valid", "test"]
#lstm_files = ["train_valid"]

args = parser.parse_args()

baseDir = Folder(args.input_dir)

#fileList = baseDir.fullFileNames("train_valid",True)
#print(fileList)
#quit()
fileList = [os.path.join(args.input_dir, f) for f in lstm_files]
print(fileList)

for path in fileList:
    fileContents = ''.join(open(path, 'r').readlines())
    fileContents = fileContents.replace("<unk>", "UNK")
    with open(path, 'w') as f:
        f.write(fileContents)
Example #11
    return numpy.median(numpy.array(lst))

def read_in_chunks(file_object, chunk_size=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

lexer = get_lexer_for_filename("jsFile.js")

f = open('really_big_file.dat')

corpora = Folder(sys.argv[1]).fullFileNames("*.js")

for path_corpus in [c for c in corpora if 'orig' in c or 'no_renaming' in c or 'hash_def_one_renaming' in c]:
    print os.path.basename(path_corpus)
    f = open(path_corpus)

    names = set([])
    
    for piece in read_in_chunks(f):
        #process_data(piece)
        
        tokens = list(lex(piece, lexer))
        
        names.update([token for (token_type, token) in tokens
                 if is_token_subtype(token_type, Token.Name)])
    
Example #12
    def check(f, keep_mini):
        mc = MiniChecker(f)
        try:
            return mc.compare(keep_mini=keep_mini)
        except Exception as e:
            return e

    print 'is_minified(../test_file1.js):', check('../test_file1.js', False)
    print 'is_minified(../node_scoper/test_input.js):', check(
        '../node_scoper/test_input.js', False)
    print 'is_minified(../test_file2.js):', check('../test_file2.js', False)

    print
    from folderManager import Folder
    for js_file in sorted(
            Folder('../data/js_files.sample').fullFileNames("*.js"),
            key=lambda f: int(os.path.basename(f).split('.')[0])):

        prepro = Preprocessor(js_file)
        prepro.write_temp_file('tmp.js')

        beauty = Beautifier()
        ok = beauty.run('tmp.js', 'tmp.b.js')
        os.remove('tmp.js')

        if ok:
            print 'is_minified(%s):' % os.path.basename(js_file), check(
                'tmp.b.js', False)
            os.remove('tmp.b.js')
Example #13
import os
from folderManager import Folder

codeFolder = Folder("/Users/caseycas/CodeNLP/EFLCorpus/TECCL_Corpus_V1.1/02TECCL_V1.1_POS/")

fileList = codeFolder.fullFileNames("*.txt")
print(len(fileList))

for path in fileList:
    print(path)
    lines = []
    with open(path, 'r') as f:
        for line in f:
            lines.append(line.replace("<s>", "").replace("</s>", ""))
    with open(path, 'w') as f:
        for line in lines:
            f.write(line)
Example #14
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 
                                             os.path.pardir)))
import glob
from folderManager import Folder


root_path = os.path.abspath(sys.argv[1])
target_path = os.path.abspath(sys.argv[2])



for corpus in Folder(root_path).subfoldersNoHidden():

    target_dir = os.path.join(target_path, os.path.basename(corpus))
    print target_dir
    o = Folder(target_dir).create()

    corpus_files = [f \
                    for f in glob.glob(os.path.join(os.path.join(root_path, corpus), "*.js")) \
                    if os.path.basename(f).startswith("corpus.")]
    
    for f in corpus_files:
        with open(os.path.join(target_dir, os.path.basename(f)), "w") as g:
            for row in open(f).readlines():
                if len(row.split()) <= 20:
                    g.write(row)
        "Collapsed replaces all the name types (plus Keyword.Type) with their label."
    )
    print("Collapsed option also forces the string option to 1.")
    print("And finally two y/n options on whether to keep files with over " +
          str(MAX_SIZE))
    print(
        "tokens and if we want to explicitly have the type <Token|Type> in the output files."
    )

    quit()

print(sys.argv)

# Path to root folder containing the source code
basePath = os.path.abspath(sys.argv[1])
codeFolder = Folder(basePath)

#print(codeFolder)

# File type to be considered
fileExtension = sys.argv[2]

# Path to output file with tokenized code
outputDir = sys.argv[3]
#outputFile = open(os.path.abspath(sys.argv[3]), 'wb')
#writer = UnicodeWriter(outputFile)

#String flag
strFlag = int(sys.argv[4])

token_split = sys.argv[5]
Example #16
        mosesStatus = checkMosesServers(
            moses_url_dict)  #Eventually turn into list of failed servers
        #Do a simple kill and restart for the moment (can change to something more selective later).
        if (args.debug):
            print(mosesStatus)
        for port, status in mosesStatus.iteritems():
            if (status == "E" or status == "F"):
                mosesFail = True

        if (not mosesFail):  #Stop checking once the servers are online.
            break

print("Servers are online.")

if (args.batch):
    inputFolder = Folder(os.path.abspath(args.input))
    fileList = inputFolder.fullFileNames("*.js", recursive=False)
    for next_file in fileList:
        print("Renaming " + str(next_file))
        base_file = ntpath.basename(next_file)
        output_file = \
            os.path.join(args.output,
                base_file[:base_file.rfind(".")] + ".out.js")
        try:
            processFile(next_file, output_file, args)
        except:
            print("Renaming of " + str(next_file) + " failed.")
else:
    print("Renaming " + str(args.input))
    processFile(args.input, args.output, args)
Example #17
inputDir = args.inputDir
fileType = args.fileType
outputFile = args.outputFile
ngramOrder = args.ngramOrder
testLocation = args.testLocation
projectMap = args.projectMap
trackTypes = args.trackTypes
independentSplit = args.independentSplit

assert (ngramOrder > 0)

if (args.independentSplit):
    #Divide the corpus into two.
    #1) Get set of files in the base directory.
    basePath = os.path.abspath(inputDir)
    codeFolder = Folder(basePath)
    #This variant requires wildcard patterns of the form *.<ext>, not .*.<ext>
    fileList = codeFolder.fullFileNames(fileType[1:], recursive=False)
    #2) Divide into 2 randomly.
    splitFiles = listUtils.partitionList(fileList)
    #3) Save each in a temp directory. (cd ../input_dir mkdir [rank|freq]_input_dir) (use ln -s to copy)
    (parentDir, localDir) = os.path.split(basePath)
    rankDir = os.path.join(parentDir, "rank_" + localDir)
    freqDir = os.path.join(parentDir, "freq_" + localDir)
    #print(splitFiles[0])
    #print(splitFiles[1])
    #print(len(splitFiles[0]))
    #print(len(splitFiles[1]))
    #print(rankDir)
    #print(freqDir)
    #quit()
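    #A rough sketch of the helpers implied by steps 2) and 3) above; the real
    #listUtils.partitionList and the directory-population code are not shown in
    #this snippet, so the following is an assumption about their shape:
    #
    #   def partitionList(items):
    #       shuffled = list(items)
    #       random.shuffle(shuffled)
    #       half = len(shuffled) // 2
    #       return (shuffled[:half], shuffled[half:])
    #
    #   for targetDir, files in ((rankDir, splitFiles[0]), (freqDir, splitFiles[1])):
    #       if not os.path.exists(targetDir):
    #           os.mkdir(targetDir)
    #       for srcFile in files:
    #           os.symlink(srcFile, os.path.join(targetDir, os.path.basename(srcFile)))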
Example #18
'''
Select a random sample from a corpus folder.
'''

import os
import random
from shutil import copyfile

import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

from folderManager import Folder

corpus_dir = Folder(sys.argv[1])
sample_size = int(sys.argv[2])
out_dir = Folder(sys.argv[3]).create()

corpus_sample = random.sample(corpus_dir.fullFileNames("*.js"), sample_size)

for f in corpus_sample:
    copyfile(f, os.path.join(out_dir, os.path.basename(f)))


Example #19
            with open(os.path.join(output_path, r_strategy, js_file_path),
                      'w') as f:
                f.write(text)

        return (js_file_path, 'OK', None)

    except Exception as e:
        return (js_file_path, None, str(e).replace("\n", ""))


if __name__ == "__main__":

    corpus_root = os.path.abspath(sys.argv[1])
    sample_path = os.path.abspath(sys.argv[2])

    output_path = Folder(sys.argv[3]).create()
    num_threads = int(sys.argv[4])

    flog = 'log_renameAndUglifyMem_' + os.path.basename(corpus_root)

    RS = RenamingStrategies()

    Folder(os.path.join(output_path, 'orig')).create()
    for r_strategy in RS.all():
        Folder(os.path.join(output_path, r_strategy)).create()

    with open(sample_path, 'r') as f:

        reader = UnicodeReader(f)

        pool = multiprocessing.Pool(processes=num_threads)
Example #20
import sys
import os
from folderManager import Folder
from unicodeManager import UnicodeWriter
from utilities import *
from pygments.lexers import get_lexer_for_filename
from pygments import lex

if len(sys.argv) < 4:
    print 'Usage: python lex.py path_to_code_folder file_name_extension output_file'
    exit()

print sys.argv

# Path to root folder containing the source code
codeFolder = Folder(os.path.abspath(sys.argv[1]))

# File type to be considered
fileExtension = sys.argv[2]

# Path to output file with tokenized code
outputFile = open(os.path.abspath(sys.argv[3]), 'wb')
writer = UnicodeWriter(outputFile)

for path in codeFolder.fullFileNames(fileExtension, recursive=True):
    try:
        fileContents = ''.join(open(path, 'r').readlines())
        lexer = get_lexer_for_filename(path)
        tokens = lex(fileContents, lexer) # returns a generator of tuples
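        # each tuple pairs a pygments token type with its text,
        # e.g. (Token.Name, u'foo') or (Token.Operator, u'=') (illustrative values)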
        tokensList = list(tokens)
        language = languageForLexer(lexer)
Example #21
                                         "\n")

        return (
            js_file_path,
            orig,
            no_renaming,
            #                 basic_renaming,
            #                 normalized,
            hash_def_one_renaming,
            hash_def_two_renaming)

    except Exception as e:
        return (js_file_path, None, str(e))

files_root = os.path.abspath(sys.argv[1])
output_path = Folder(sys.argv[2]).create()
sample_size = int(sys.argv[3])
num_threads = int(sys.argv[4])

flog = 'log_renameAndUglify'

in_log = set([])
reader = UnicodeReader(open(os.path.join(files_root, flog), 'r'))
try:
    for row in reader:
        if row[1] == 'OK':
            in_log.add(row[0])
except:
    pass
print len(in_log), 'in log'
Example #22
'''
Split JS files into training, tuning, and testing
'''

import os
import random

import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

from folderManager import Folder
from unicodeManager import UnicodeReader, UnicodeWriter

corpus_dir = Folder(sys.argv[1])


def str_to_bool(s):
    if s == 'True':
        return True
    return False


isMini = {}
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    isMini[row[0]] = str_to_bool(row[1])

eligible = [
    os.path.basename(f) for f in corpus_dir.fullFileNames("*.js")
    if not isMini.get(os.path.basename(f), False)
Example #23
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        cleanup(temp_files)
        return (js_file_path, True, 'OK')

    except Exception as e:
        cleanup(temp_files)
        return (js_file_path, False, str(e))


corpus_root = os.path.abspath(sys.argv[1])
training_sample_path = os.path.abspath(sys.argv[2])

output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

flog = 'log_' + os.path.basename(training_sample_path)

try:
    for f in [flog]:
        os.remove(os.path.join(output_path, f))
except:
    pass


with open(training_sample_path, 'r') as f, \
        open(os.path.join(output_path, flog), 'w') as g:

    reader = UnicodeReader(f)
Example #24
import os
from folderManager import Folder
import codecs
import sys
import utilities

try:
    inputDir = sys.argv[1]
    fileType = sys.argv[2]
    outputDir = sys.argv[3]
except:
    print("usage python Freq.py inputDir fileTypeWildCard outputDir")
    print("e.g. python Freq.py /Users/caseycas/CodeNLP/HaskellCorpus/files/ *.hs.tokens /Users/caseycas/CodeNLP/HaskellCorpus/files/ 2")
    quit()



# Path to root folder containing the source code
codeFolder = Folder(os.path.abspath(inputDir))
for path in codeFolder.fullFileNames(fileType, recursive=False):
    filename = os.path.basename(path)
    print(filename)
    words = []
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            words += line.split()

        words = [utilities.removeLabel(w) for w in words]

    with codecs.open(os.path.join(outputDir, filename), 'w', 'utf-8') as f2:
        f2.write(" ".join(words))