def get_pred_ref(input_file):
    """Load model predictions and pair them with test-set reference comments.

    Reads tab-separated ``fid\tprediction`` lines from *input_file* and
    comma-separated ``fid,comment`` lines from ``<dataprep>/coms.test``
    (``dataprep`` is a module-level path set elsewhere in this script).
    Both token lists are passed through the project's ``fil`` filter --
    presumably a token cleaner; confirm against its definition.

    Returns:
        (newpreds, refs): two dicts keyed by int fid. ``newpreds[fid]`` is the
        filtered prediction token list; ``refs[fid]`` is a one-element list
        holding the filtered reference token list. Only fids present in BOTH
        files appear in the result.
    """
    import tokenizer  # kept: importing may have side effects other code relies on

    prep('preparing predictions list... ')
    preds = dict()
    # Fix: use 'with' so the handle is closed even if a malformed line raises.
    with open(input_file, 'r') as predicts:
        for line in predicts:
            (fid, pred) = line.split('\t')
            fid = int(fid)
            preds[fid] = fil(pred.split())
    drop()

    # Fix: the original opened this file and never closed it (leak); it also
    # compiled an unused regex (re_0001_) and kept dead locals (d, c, ex) --
    # all removed.
    refs = dict()
    newpreds = dict()
    with open('%s/coms.test' % (dataprep), 'r') as targets:
        for line in targets:
            (fid, com) = line.split(',')
            fid = int(fid)
            com = fil(com.split())
            # EAFP: keep only predictions that have a matching reference fid.
            try:
                newpreds[fid] = preds[fid]
            except KeyError:
                continue
            refs[fid] = [com]
    return newpreds, refs
# Pull run configuration out of the parsed command-line arguments.
outfile = args.outfile
zerodats = args.zerodats
datfile = args.datfile
testval = args.testval

# Default the output filename to the basename of the model file.
if outfile is None:
    outfile = modelfile.split('/')[-1]

K.set_floatx(args.dtype)
os.environ['CUDA_VISIBLE_DEVICES'] = gpu
os.environ['TF_CPP_MIN_LOG_LEVEL'] = args.tf_loglevel

sys.path.append(dataprep)
import tokenizer

# Fix: the original passed open(...) straight to pickle.load and leaked every
# file handle; 'with' closes them deterministically.
prep('loading tokenizers... ')
with open('%s/tdats.tok' % (dataprep), 'rb') as f:
    tdatstok = pickle.load(f, encoding='UTF-8')
with open('%s/coms.tok' % (dataprep), 'rb') as f:
    comstok = pickle.load(f, encoding='UTF-8')
with open('%s/smls.tok' % (dataprep), 'rb') as f:
    smltok = pickle.load(f, encoding='UTF-8')
drop()

prep('loading sequences... ')
with open('%s/%s' % (dataprep, datfile), 'rb') as f:
    seqdata = pickle.load(f)
drop()

# Convert the 'yes'/'no' command-line string into a real boolean
# (prints before and after, matching the original's debug output).
print(zerodats)
zerodats = (zerodats == 'yes')
print(zerodats)
# Pull run configuration out of the parsed command-line arguments.
dataprep = args.dataprep
gpu = args.gpu
batch_size = args.batch_size
epochs = args.epochs
modeltype = args.modeltype
multigpu = args.multigpu

K.set_floatx(args.dtype)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = args.tf_loglevel

sys.path.append(dataprep)
import tokenizer

init_tf(gpu)

# Fix: the original passed open(...) straight to pickle.load and leaked every
# file handle; 'with' closes them deterministically.
prep('loading tokenizers... ')
with open('%s/tdats.tok' % (dataprep), 'rb') as f:
    tdatstok = pickle.load(f, encoding='UTF-8')
with open('%s/sdats.tok' % (dataprep), 'rb') as f:
    sdatstok = pickle.load(f, encoding='UTF-8')
with open('%s/coms.tok' % (dataprep), 'rb') as f:
    comstok = pickle.load(f, encoding='UTF-8')
with open('%s/smls.tok' % (dataprep), 'rb') as f:
    smltok = pickle.load(f, encoding='UTF-8')
drop()

prep('loading sequences... ')
with open('%s/dataset.pkl' % (dataprep), 'rb') as f:
    seqdata = pickle.load(f)
drop()

# Steps per epoch. NOTE(review): when len is an exact multiple of batch_size
# this yields one extra step -- preserved as-is since the generator may rely
# on it; confirm before "fixing".
steps = int(len(seqdata['ctrain']) / batch_size) + 1
# Select the data directory for the requested preprocessing variant
# (later assignment wins if both flags are set, as in the original).
if obfuscate:
    dataprep = '../data/obfuscation/output'
if sbt:
    dataprep = '../data/sbt/output'

if input_file is None:
    print('Please provide an input file to test with --input')
    # Fix: sys.exit() instead of the site-provided exit(), which is not
    # guaranteed to exist when Python runs without the site module (-S).
    sys.exit()

sys.path.append(dataprep)
import tokenizer

prep('preparing predictions list... ')
preds = dict()
# Fix: 'with' closes the handle even if a malformed line raises; the unused
# enumerate() counter from the original is dropped.
with open(input_file, 'r') as predicts:
    for line in predicts:
        (fid, pred) = line.split('\t')
        fid = int(fid)
        preds[fid] = fil(pred.split())
drop()

# Matches any non-alphanumeric character, or a lower/digit->upper camelCase
# boundary (used elsewhere for token splitting).
re_0001_ = re.compile(r'([^a-zA-Z0-9 ])|([a-z0-9_][A-Z])')
refs = list()
newpreds = list()
# NOTE(review): the next two statements are the tail of a substitution helper
# whose `def` line falls outside this chunk -- `substrings`, `substitutions`,
# and `string` are its parameters/locals, not module globals. Left untouched.
regex = re.compile('|'.join(map(re.escape, substrings)))
return regex.sub(lambda match: substitutions[match.group(0)], string)

def qID():
    # Return the next value of the global question-id counter.
    global qid
    qid +=1
    return qid

def aID():
    # Return the next value of the global answer-id counter.
    global aid
    aid +=1
    return aid

datasetloc = 'srcmldat'

# loading srcml to aid finding the elements in code
prep('loading srcmlunits... ')
# NOTE(review): handles passed to pickle.load are never closed here; and
# pickle.load must only be used on trusted local data.
srcmlunits = pickle.load(open(datasetloc + '/srcml-standard.pkl', 'rb'))
sml2 = pickle.load(open(datasetloc + '/srcml-final-allcoms.pkl', 'rb'))
# Overlay the "final-allcoms" units on top of the standard ones (later wins).
for key, val in sml2.items():
    srcmlunits[key] = val
drop()

class MyHTMLParser(HTMLParser):
    # HTML parser that accumulates parsed question/answer material.
    # NOTE(review): the class body continues beyond this chunk; only
    # __init__ is visible here.
    def __init__(self):
        super(MyHTMLParser, self).__init__()
        self.parentstack = list()   # presumably a stack of open tags -- confirm in handlers
        self.qasynth = dict()       # collected synthesis output (schema set by later methods)
        self.qasynth2 = dict()      # secondary collection -- purpose not visible in this chunk
        self.dataseq = list()       # ordered sequence of parsed data items
import networkx as nx
import re
import statistics
import numpy as np

def load_good_fid():
    """Return the list of function ids (first CSV column, as ints) found in
    ./output/dataset.coms."""
    filename = './output/dataset.coms'
    good_fid = []
    # NOTE(review): the file handle from open() is never closed here.
    for line in open(filename):
        tmp = [x.strip() for x in line.split(',')]
        fid = int(tmp[0])
        good_fid.append(fid)
    return good_fid

# Load srcML parse data; overlay the "final-allcoms" units on the standard
# ones (later assignment wins). Handles passed to pickle.load are not closed.
prep('loading srcmlunits... ')
srcmlunits = pickle.load(open('srcml-standard.pkl', 'rb'))
sml2 = pickle.load(open('fundatsparsed-srcml-final-allcoms.pkl', 'rb'))
for key, val in sml2.items():
    srcmlunits[key] = val
drop()

def re_0002(i):
    # split camel case and remove special characters
    # NOTE(review): regex-sub callback taking a match object `i`. The body is
    # truncated in this chunk -- the `else:` branch continues beyond the
    # visible source, so the code below is left byte-identical.
    tmp = i.group(0)
    if len(tmp) > 1:
        if tmp.startswith(' '):
            return tmp
        else:
import math
import traceback
import argparse
import signal
import atexit
import time
import random
import pickle  # fix: pickle is used below but was missing from this import block

import tensorflow as tf
import numpy as np
import networkx as nx

from myutils import prep, drop

prep('loading sequences... ')
# Fix: the original passed open(...) straight to pickle.load and leaked the
# file handle; 'with' closes it deterministically. pickle.load must only be
# used on trusted local data.
with open('/nfs/projects/attn-to-fc/data/standard_3dfiles_graphast/dataset.pkl', 'rb') as f:
    seqdata = pickle.load(f)
drop()

#fid=122380
#print(wsmlnodes)
#print(wsmledges)

def idx2tok(nodelist, path):
    """Map each node index in *path* to its token in *nodelist*.

    Args:
        nodelist: indexable sequence (or mapping) of node tokens.
        path: iterable of indices into nodelist.
    Returns:
        list of the tokens at those indices, in path order.
    """
    return [nodelist[idx] for idx in path]

# one way