def main(configurationDirectory):
    pjoin = os.path.join
    abspath = os.path.abspath
    psplit = os.path.split
    pexists = os.path.exists
    pisdir = os.path.isdir
    app_conf = load(file(os.path.join(configurationDirectory,
                                      "..\\settings\\app_config.yaml"), u'r'),
                    Loader=Loader)
    view_server_info = {}
    print
    print "Confero Track Started. "
    if app_conf.get('view_server', {}).get('address').lower() in [None, 'auto', 'bonjour']:
        # Discover the Confero View server on the local network via Bonjour.
        view_server_info = findConferoViewServer()
        print "Found Confero Server via Bonjour:", view_server_info['ip'], view_server_info['port']
        print
        app_conf.get('view_server')['address'] = view_server_info['ip']
        app_conf.get('view_server')['port'] = view_server_info['port']
    else:
        # Use the address and port given in app_config.yaml.
        view_server_info['ip'] = app_conf.get('view_server')['address']
        view_server_info['port'] = app_conf.get('view_server')['port']
        print "Using app_config settings for Confero Server connection:", view_server_info['ip'], view_server_info['port']
        print
    DataCollectionRuntime.view_server_ip = view_server_info['ip']
    DataCollectionRuntime.results_root_folder = \
        app_conf.get('results_root_folder')
    if not (pexists(DataCollectionRuntime.results_root_folder) and
            pisdir(DataCollectionRuntime.results_root_folder)):
        # Interpret a relative results folder as relative to the script directory.
        DataCollectionRuntime.results_root_folder = abspath(pjoin(
            DataCollectionRuntime.script_dir,
            DataCollectionRuntime.results_root_folder))
    util.createPath(DataCollectionRuntime.results_root_folder)
    try:
        ws = createWebsocketInterface(app_conf)
    except socket.error, e:
        if e.errno == 10035:
            # WSAEWOULDBLOCK: non-blocking socket not ready yet; ignore.
            pass
        elif e.errno in [10054, 10061]:
            # Connection reset / connection refused.
            print('WebSocket could not be connected to feedback server. '
                  'Is the server program running?')
            return 'EXIT_PROGRAM'
        else:
            raise e
def plot_graph(cluster_range, all_clust_err, args):
    if args.emb_technique == "fasttext":
        embtype = config.fasttext_method + "_" + str(args.emb_dim)
    else:
        embtype = args.emb_dim
    file_name = config.image_path['file_name'].format(embtype, args.clust_range)
    plt.plot(cluster_range, all_clust_err)
    plt.title("Error trend for embedding size {}, embedding algorithm {}".format(args.emb_dim, args.emb_technique))
    plt.xlabel("Number of clusters")
    plt.ylabel("Clustering Error")
    path = config.image_path[args.lang].format(args.emb_technique, config.word_emb_training_type,
                                               args.epochs, config.filter_word_on, file_name)
    util.createPath(path)
    plt.savefig(path)
    plt.show()
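# Usage sketch for plot_graph (hypothetical values: the real args object comes from
# this project's argparse setup and config module, and the error values come from
# actual training runs; the numbers below are purely illustrative).
from argparse import Namespace

example_args = Namespace(emb_technique="w2v", emb_dim=200, clust_range="5-25",
                         lang="en", epochs=15)
cluster_range = [5, 10, 15, 20, 25]
all_clust_err = [0.92, 0.81, 0.74, 0.70, 0.69]  # one clustering-error value per cluster size
plot_graph(cluster_range, all_clust_err, example_args)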
def __init__(self, inFileName, outFile, lang='EN'):
    self.lang = lang
    self.inFile = codecs.open(inFileName, 'r', 'utf-8')
    util.createPath(outFile)
    self.outFile = codecs.open(outFile, 'w', 'utf-8')
    if lang == "EN":
        self.lmtzr = WordNetLemmatizer()
        self.stop = stopwords.words('english')
        self.replacement_dict = DataCorrection.replacement_dict
        self.relevant_terms = DataCorrection.relevant_terms
        self.contract = DataCorrection.contraction
    elif lang == "DE":
        self.spacy_model_de = spacy.load('de')
        german_stop = stopwords.words('german')
        self.stop = [self.umlauts(word) for word in german_stop]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-emb_name", "--emb_name", dest="emb_name", type=str, metavar='<str>', default='w2v',
                        help="Name of the embedding algorithm (w2v or fasttext)")
    parser.add_argument("-ft_method", "--fasttext_emb_method", dest="fasttext_emb_method", type=str,
                        metavar='<str>', default='skipgram',
                        help="Name of the fasttext embedding algorithm (skipgram or cbow)")
    parser.add_argument("-emb_dim", "--emb_dim", dest="emb_dim", type=int, default=200,
                        help="Embedding dimension value")
    parser.add_argument("-sf", "--source-file", dest="source_file", type=str, metavar='<str>',
                        help="Name of the source file")
    parser.add_argument("-gv_vocab", "--glove_vocab", dest="glove_vocab", type=str, metavar='<str>',
                        help="Name of the vocab file")
    parser.add_argument("-gv_cooccur", "--gv_cooccur", dest="glove_cooccurence", type=str, metavar='<str>',
                        help="Name of the co-occurrence file")
    parser.add_argument("-lang", "--lang", dest="lang", type=str, metavar='<str>', default='en',
                        help="Name of the language (German/English)")
    args = parser.parse_args()

    source = config.data_source[args.lang].format(config.filter_word_on)
    if args.emb_name == "w2v":
        print('training word embeddings for {}...'.format(args.lang))
        emb_dim = args.emb_dim
        if args.lang == 'en':
            model_file = config.emb_dir_en['w2v'].format(config.word_emb_training_type) + '/w2v_embedding_' + str(emb_dim)
        elif args.lang == 'de':
            model_file = config.emb_dir_de['w2v'].format(config.word_emb_training_type) + '/w2v_embedding_' + str(emb_dim)
        model = w2v(source, emb_dim)
        createPath(model_file)
        model.save(model_file)
    elif args.emb_name == "fasttext":
        print('training fasttext word embeddings for {}...'.format(args.lang))
        method = args.fasttext_emb_method
        emb_dim = args.emb_dim
        model = fasttext_embedding(source, method, emb_dim)
        if args.lang == 'en':
            model_file = config.emb_dir_en['fasttext'].format(config.word_emb_training_type) + '/w2v_embedding_' + args.fasttext_emb_method + "_" + str(emb_dim)
        elif args.lang == 'de':
            model_file = config.emb_dir_de['fasttext'].format(config.word_emb_training_type) + '/w2v_embedding_' + args.fasttext_emb_method + "_" + str(emb_dim)
        createPath(model_file)
        model.save(model_file)
    elif args.emb_name == 'glove':
        logger.info('Generate glove embedding...')
        glove_embedding(source, args.glove_vocab, args.glove_cooccurence, args.lang)
    else:
        print("Wrong embedding name")
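# Example invocations of the training entry point above (illustrative; the script
# name is a placeholder, flag names match the argparse definitions, and output
# locations depend on this repo's config module):
#   python train_word_embeddings.py -emb_name w2v -emb_dim 200 -lang en
#   python train_word_embeddings.py -emb_name fasttext -ft_method cbow -emb_dim 100 -lang de
#   python train_word_embeddings.py -emb_name glove -gv_vocab vocab.pkl -gv_cooccur cooccur.pkl -lang en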
def glove_embedding(filename, vocab_file, cooccurence_file, lang):
    gv = Glove()
    if vocab_file and cooccurence_file:
        # Vocab and co-occurrence matrix already exist: fine-tune pre-trained GloVe with Mittens.
        vocab = gv.load_vocab_in_order(vocab_file)
        cooccurence = gv.load_cooccurence_matrix(cooccurence_file)
        logger.info('get pre-trained glove embedding')
        original_embedding = gv.get_original_embedding(config.glove_pretrained_emb[lang])
        mittens_model = Mittens(n=300, max_iter=1000)
        logger.info('Start fine tuning...')
        new_embeddings = mittens_model.fit(cooccurence, vocab=vocab,
                                           initial_embedding_dict=original_embedding)
        fout = open(config.glove_fine_tuned_emb[lang], 'wb')
        pickle.dump(new_embeddings, fout)
        fout.close()
        logger.info('Fine tuning complete')
    else:
        # Otherwise build the vocab and co-occurrence matrix from the corpus first.
        if lang == 'de':
            logger.info('Load german data')
        elif lang == 'en':
            logger.info('Load english data')
        fin = codecs.open(filename, 'r', 'utf-8')
        corpus = []
        for line in fin:
            corpus.append(line)
        vocab = gv.build_vocab(corpus)
        vocab_file = config.glove_fine_tuned_vocab[lang]
        createPath(vocab_file)
        outfile = open(vocab_file, 'wb')
        pickle.dump(vocab, outfile)
        outfile.close()
        logger.info("Fetching cooccurrence list..")
        cooccurrences = gv.build_cooccur(vocab, corpus)
        cooccurrences = gv.convert_cooccurence_matrix(cooccurrences, len(vocab))
        cooccurrence_file = config.glove_fine_tuned_cooccurance[lang]
        joblib.dump(cooccurrences, cooccurrence_file)
        logger.info("Cooccurrence list fetch complete (%i pairs).\n", cooccurrences.shape[0])
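# Sketch of reading the fine-tuned GloVe matrix back in, mirroring the pickle.dump
# above (assumes config.glove_fine_tuned_emb[lang] is the path glove_embedding wrote;
# the helper name is illustrative).
import pickle

def load_fine_tuned_glove(lang):
    with open(config.glove_fine_tuned_emb[lang], 'rb') as f:
        return pickle.load(f)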
path = bugPath
with open(path, 'r') as file:
    count = 1
    pairCount = 1
    for line in file:
        url = line.split(" ")[0]
        fixcommit = line.split(" ")[1].strip()
        apiurl = url + "/commits/" + fixcommit
        headers = {'Authorization': 'token ' + tokens}
        # Back off until the GitHub rate limit has recovered.
        remainingNumber = util.getRemaining(tokens)
        if int(remainingNumber) < 5:
            while int(remainingNumber) < 5000:
                time.sleep(100)
                remainingNumber = util.getRemaining(tokens)
                print("remaining number is " + remainingNumber + ", not enough rate limit, sleeping a while")
        login = requests.get(apiurl, headers=headers)
        apiinfor = login.json()
        count = count + 1
        if "url" in apiinfor:
            sys.stdout.write("token remaining:" + remainingNumber + " ")
            print(partation + " pairCount " + str(pairCount) + " " + partation + " " + tokenNumber)
            eachPairPath = clonePath + "/" + str(pairCount)
            util.createPath(eachPairPath)
            getInfoFromFixCommit(apiinfor, fixcommit, eachPairPath, url)
            pairCount = pairCount + 1
print(partation + " archiveInfo.txt DONE!!!!")
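# util.getRemaining is not shown in this snippet; a plausible sketch, assuming it
# queries GitHub's /rate_limit endpoint with the same personal-access token and
# returns the remaining core quota as a string (the callers above both int() and
# concatenate the value):
import requests

def get_remaining(token):
    resp = requests.get("https://api.github.com/rate_limit",
                        headers={"Authorization": "token " + token})
    return str(resp.json()["resources"]["core"]["remaining"])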
    app_conf = load(file(pjoin(configurationDirectory,
                               "..\\settings\\app_config.yaml"), u'r'),
                    Loader=Loader)
    app_conf.get('view_server')['address'] = view_server_info['ip']
    app_conf.get('view_server')['port'] = view_server_info['port']
    DataCollectionRuntime.results_root_folder = \
        app_conf.get('results_root_folder')
    if not (pexists(DataCollectionRuntime.results_root_folder) and
            pisdir(DataCollectionRuntime.results_root_folder)):
        DataCollectionRuntime.results_root_folder = abspath(pjoin(
            DataCollectionRuntime.script_dir,
            DataCollectionRuntime.results_root_folder))
    util.createPath(DataCollectionRuntime.results_root_folder)
    cmd = None
    app_conf = None
elif cmd == 'START_EXP_SESSION':
    if runtime:
        runtime.close()
        runtime._close()
        runtime = None
    ### Update App Config Settings for this exp. session ###
    #
    # Read the default app config yaml file
    #
    app_conf_path = pjoin(configurationDirectory,
                          "..\\settings\\app_config.yaml")
    app_conf = load(file(app_conf_path, u'r'), Loader=Loader)
def train_model_each_cluster(args, cluster_size, embtype):
    logger.info("Cluster Size: {}".format(cluster_size))
    args.aspect_size = cluster_size
    if args.seed > 0:
        np.random.seed(args.seed)
    aspect_file_name = config.aspect_file_name[args.lang].format(args.emb_technique,
                                                                 config.word_emb_training_type,
                                                                 args.epochs, config.filter_word_on,
                                                                 embtype, cluster_size)
    model_path = config.model_param_file[args.lang].format(args.emb_technique,
                                                           config.word_emb_training_type,
                                                           args.epochs, config.filter_word_on)
    util.createPath(aspect_file_name)
    vocab, train_x, overall_maxlen = dataset.get_data(vocab_size=args.vocab_size,
                                                      maxlen=args.maxlen, lang=args.lang)
    train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
    print('Number of training examples: ', len(train_x))
    print('Length of vocab: ', len(vocab))

    optimizer = get_optimizer(args.algorithm)
    logger.info('Building {} based model for {}'.format(args.emb_technique, args.lang))
    model = create_model(args, overall_maxlen, vocab)
    # freeze the word embedding layer
    model.get_layer('word_emb').trainable = False
    model.compile(optimizer=optimizer, loss=util.max_margin_loss, metrics=[util.max_margin_loss])

    logger.info("-" * 80)
    vocab_inv = {}
    for w, ind in vocab.items():
        vocab_inv[ind] = w

    sen_gen = sentence_batch_generator(train_x, args.batch_size)
    neg_gen = negative_batch_generator(train_x, args.batch_size, args.neg_size)
    batches_per_epoch = len(train_x) // args.batch_size

    min_loss = float('inf')
    for ii in range(args.epochs):
        t0 = time()
        loss, max_margin_loss = 0., 0.
        for b in tqdm(range(batches_per_epoch)):
            sen_input = next(sen_gen)
            neg_input = next(neg_gen)
            batch_loss, batch_max_margin_loss = model.train_on_batch(
                [sen_input, neg_input], np.ones((args.batch_size, 1)))
            loss += batch_loss / batches_per_epoch
            max_margin_loss += batch_max_margin_loss / batches_per_epoch
        tr_time = time() - t0

        if loss < min_loss:
            # New best epoch: save the model and write the top words per aspect.
            min_loss = loss
            word_emb = K.get_value(model.get_layer('word_emb').embeddings)
            aspect_emb = K.get_value(model.get_layer('aspect_emb').W)
            word_emb = word_emb / np.linalg.norm(word_emb, axis=-1, keepdims=True)
            aspect_emb = aspect_emb / np.linalg.norm(aspect_emb, axis=-1, keepdims=True)
            aspect_file = open(aspect_file_name, 'wt', encoding='utf-8')
            model.save(model_path)
            for ind in range(len(aspect_emb)):
                desc = aspect_emb[ind]
                sims = word_emb.dot(desc.T)
                ordered_words = np.argsort(sims)[::-1]
                desc_list = [vocab_inv[w] + "|" + str(sims[w]) for w in ordered_words[:50]]
                aspect_file.write('Aspect %d:\n' % ind)
                aspect_file.write(' '.join(desc_list) + '\n\n')
            aspect_file.close()
        per_cluster_train_loss = loss
        logger.info('Epoch %d, train: %is' % (ii, tr_time))
        logger.info('Total loss: %.4f, max_margin_loss: %.4f, ortho_reg: %.4f' % (
            loss, max_margin_loss, loss - max_margin_loss))
    return per_cluster_train_loss
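# Sketch of a driver that sweeps candidate cluster counts with train_model_each_cluster
# and feeds the collected losses to plot_graph above (hypothetical helper; args and
# embtype are the same objects used elsewhere in this code).
def sweep_cluster_sizes(args, embtype, cluster_range):
    all_clust_err = [train_model_each_cluster(args, k, embtype) for k in cluster_range]
    plot_graph(cluster_range, all_clust_err, args)
    return all_clust_err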
remainingNumber = util.getRemaining(tokenKey)
if int(remainingNumber) < 5:
    while int(remainingNumber) < 5000:
        time.sleep(100)
        remainingNumber = util.getRemaining(tokenKey)
        print("remaining number is " + remainingNumber + ", not enough rate limit, sleeping a while")
requestFrombuggy = requests.get(buggyUrls[i], headers=headers).json()
requestFromfix = requests.get(fixUrls[i], headers=headers).json()
print("github key remaining rate limit " + remainingNumber)
print(eachPairPath + " " + tokenNumber + "," + str(len(allfiles)) + " files")
if "download_url" in requestFrombuggy and "download_url" in requestFromfix:
    if requestFrombuggy["download_url"] is not None:
        buggyDownUrl = requestFrombuggy["download_url"]
        fixDownUrl = requestFromfix["download_url"]
        buggyversionPath = eachPairPath + "/buggy-version"
        fixedversionPath = eachPairPath + "/fixed-version"
        util.createPath(buggyversionPath)
        util.createPath(fixedversionPath)
        commonpath = buggyUrls[i].split("contents/")[1].split("?ref=")[0]
        util.cdAndWget(buggyDownUrl, buggyversionPath, commonpath.replace("/", "."))
        util.cdAndWget(fixDownUrl, fixedversionPath, commonpath.replace("/", "."))
        print("Buggy and fixed versions DONE!!!!!")
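# util.cdAndWget is not defined in this snippet; a minimal sketch of what it appears
# to do (a guess: download `url` into `directory` under the given filename via wget -O):
import os
import subprocess

def cd_and_wget(url, directory, filename):
    subprocess.run(["wget", "-q", url, "-O", os.path.join(directory, filename)],
                   check=True)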