Example #1
def main(configurationDirectory):
    pjoin = os.path.join
    abspath = os.path.abspath
    psplit = os.path.split
    pexists = os.path.exists
    pisdir = os.path.isdir

    app_conf = load(open(pjoin(configurationDirectory,
                               "..\\settings\\app_config.yaml"), 'r'),
                    Loader=Loader)

    view_server_info = {}

    print()
    print("Confero Track Started.")

    # An address of None, 'auto', or 'bonjour' means: discover the server.
    address = app_conf.get('view_server', {}).get('address')
    if address is None or address.lower() in ('auto', 'bonjour'):
        view_server_info = findConferoViewServer()
        print("Found Confero Server via Bonjour:",
              view_server_info['ip'], view_server_info['port'])
        print()
        app_conf.get('view_server')['address'] = view_server_info['ip']
        app_conf.get('view_server')['port'] = view_server_info['port']
    else:
        view_server_info['ip'] = app_conf.get('view_server')['address']
        view_server_info['port'] = app_conf.get('view_server')['port']
        print("Using app_config settings for Confero Server connection:",
              view_server_info['ip'], view_server_info['port'])
        print()

    DataCollectionRuntime.view_server_ip = view_server_info['ip']

    DataCollectionRuntime.results_root_folder = \
        app_conf.get('results_root_folder')
    # Resolve a relative results folder against the script directory.
    if not (pexists(DataCollectionRuntime.results_root_folder)
            and pisdir(DataCollectionRuntime.results_root_folder)):
        DataCollectionRuntime.results_root_folder = abspath(pjoin(
            DataCollectionRuntime.script_dir,
            DataCollectionRuntime.results_root_folder))
    util.createPath(DataCollectionRuntime.results_root_folder)

    try:
        ws = createWebsocketInterface(app_conf)
    except socket.error as e:
        if e.errno == 10035:
            # WSAEWOULDBLOCK: non-blocking socket, not a fatal error.
            pass
        elif e.errno in (10054, 10061):
            # Connection reset / refused.
            print('WebSocket could not be connected to feedback server. '
                  'Is the server program running?')
            return 'EXIT_PROGRAM'
        else:
            raise
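
Every snippet on this page funnels its output paths through util.createPath before writing. The helper itself is never shown, and the snippets come from different projects, so each likely ships its own version. A minimal sketch of what it presumably does, assuming it creates whatever directory portion of the path is missing:

import os

def createPath(path):
    # Hypothetical reconstruction. Some call sites pass a directory, others
    # a file path; this heuristic treats a final component with an extension
    # as a file name and creates only its parent directory.
    directory = os.path.dirname(path) if os.path.splitext(path)[1] else path
    if directory and not os.path.exists(directory):
        os.makedirs(directory)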
Example #2
def plot_graph(cluster_range, all_clust_err, args):
    if args.emb_technique == "fasttext":
        embtype = config.fasttext_method + "_" + str(args.emb_dim)
    else:
        embtype = args.emb_dim
    file_name = config.image_path['file_name'].format(embtype, args.clust_range)
    plt.plot(cluster_range, all_clust_err)
    plt.title("Error trend for embedding size {}, embedding algorithm {}".format(args.emb_dim, args.emb_technique))
    plt.xlabel("Number of clusters")
    plt.ylabel("Clustering Error")
    path = config.image_path[args.lang].format(args.emb_technique, config.word_emb_training_type, args.epochs, config.filter_word_on, file_name)
    util.createPath(path)
    plt.savefig(path)
    plt.show()
Example #3
def __init__(self, inFileName, outFile, lang='EN'):
    self.lang = lang
    self.inFile = codecs.open(inFileName, 'r', 'utf-8')
    util.createPath(outFile)
    self.outFile = codecs.open(outFile, 'w', 'utf-8')
    if lang == "EN":
        self.lmtzr = WordNetLemmatizer()
        self.stop = stopwords.words('english')
        self.replacement_dict = DataCorrection.replacement_dict
        self.relevant_terms = DataCorrection.relevant_terms
        self.contract = DataCorrection.contraction
    elif lang == "DE":
        self.spacy_model_de = spacy.load('de')
        german_stop = stopwords.words('german')
        self.stop = [self.umlauts(word) for word in german_stop]
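
The German branch calls a self.umlauts helper that is outside this excerpt. A plausible reconstruction, assuming it transliterates umlauts and eszett so the stopword list matches an ASCII-normalized corpus:

def umlauts(word):
    # Hypothetical helper: replace German umlauts and eszett with their
    # common ASCII transliterations (ä -> ae, ß -> ss, and so on).
    replacements = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
                    'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss'}
    for umlaut, ascii_form in replacements.items():
        word = word.replace(umlaut, ascii_form)
    return word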
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-emb_name", "--emb_name", dest="emb_name", type=str, metavar='<str>', default='w2v',
                        help="Name of the embedding algorithm (w2v or fasttext)")
    parser.add_argument("-ft_method", "--fasttext_emb_method", dest="fasttext_emb_method", type=str, metavar='<str>', default='skipgram',
                        help="Name of the fasttext embedding algorithm (skipgram or cbow)")
    parser.add_argument("-emb_dim", "--emb_dim", dest="emb_dim", type=int, default=200,
                        help="Embedding dimension value")
    parser.add_argument("-sf", "--source-file", dest="source_file", type=str, metavar='<str>',
                        help="Name of the source file")
    parser.add_argument("-gv_vocab", "--glove_vocab", dest="glove_vocab", type=str, metavar='<str>',
                        help="Name of the vocab file")
    parser.add_argument("-gv_cooccur", "--gv_cooccur", dest="glove_cooccurence", type=str, metavar='<str>',
                        help="Name of the cooccurrence file")
    parser.add_argument("-lang", "--lang", dest="lang", type=str, metavar='<str>', default='en',
                        help="Name of the language (German/English)")
    args = parser.parse_args()
    source = config.data_source[args.lang].format(config.filter_word_on)
    if args.emb_name == "w2v":
        print('training word embeddings for {}...'.format(args.lang))
        emb_dim = args.emb_dim
        if args.lang == 'en':
            model_file = config.emb_dir_en['w2v'].format(config.word_emb_training_type) + '/w2v_embedding_' + str(emb_dim)
        elif args.lang == 'de':
            model_file = config.emb_dir_de['w2v'].format(config.word_emb_training_type) + '/w2v_embedding_' + str(emb_dim)
        model = w2v(source, emb_dim)
        createPath(model_file)
        model.save(model_file)
    elif args.emb_name == "fasttext":
        print('training fasttext word embeddings for {}...'.format(args.lang))
        method = args.fasttext_emb_method
        emb_dim = args.emb_dim
        model = fasttext_embedding(source, method, emb_dim)
        if args.lang == 'en':
            model_file = config.emb_dir_en['fasttext'].format(config.word_emb_training_type) + '/w2v_embedding_' + args.fasttext_emb_method + "_" + str(emb_dim)
        elif args.lang == 'de':
            model_file = config.emb_dir_de['fasttext'].format(config.word_emb_training_type) + '/w2v_embedding_' + args.fasttext_emb_method + "_" + str(emb_dim)
        createPath(model_file)
        model.save(model_file)
    elif args.emb_name == 'glove':
        logger.info('Generate glove embedding...')
        glove_embedding(source, args.glove_vocab, args.glove_cooccurence, args.lang)
    else:
        print("Wrong embedding name")
Example #5
def glove_embedding(filename, vocab_file, cooccurence_file, lang):
    gv = Glove()
    if vocab_file and cooccurence_file:
        vocab = gv.load_vocab_in_order(vocab_file)
        cooccurence = gv.load_cooccurence_matrix(cooccurence_file)
        logger.info('Get pre-trained glove embedding')
        original_embedding = gv.get_original_embedding(config.glove_pretrained_emb[lang])
        # n must match the dimensionality of the pre-trained vectors (300 here).
        mittens_model = Mittens(n=300, max_iter=1000)
        logger.info('Start fine tuning...')
        new_embeddings = mittens_model.fit(cooccurence, vocab=vocab,
                                           initial_embedding_dict=original_embedding)
        fout = open(config.glove_fine_tuned_emb[lang], 'wb')
        pickle.dump(new_embeddings, fout)
        fout.close()
        logger.info('Fine tuning complete')
    else:
        if lang == 'de':
            logger.info('Load German data')
        elif lang == 'en':
            logger.info('Load English data')
        fin = codecs.open(filename, 'r', 'utf-8')
        corpus = []
        for line in fin:
            corpus.append(line)
        vocab = gv.build_vocab(corpus)
        vocab_file = config.glove_fine_tuned_vocab[lang]
        createPath(vocab_file)
        outfile = open(vocab_file, 'wb')
        pickle.dump(vocab, outfile)
        outfile.close()
        logger.info("Fetching cooccurrence list..")
        cooccurrences = gv.build_cooccur(vocab, corpus)
        cooccurrences = gv.convert_cooccurence_matrix(cooccurrences, len(vocab))
        cooccurrence_file = config.glove_fine_tuned_cooccurance[lang]
        joblib.dump(cooccurrences, cooccurrence_file)
        logger.info("Cooccurrence list fetch complete (%i pairs).\n", cooccurrences.shape[0])
Example #6
path = bugPath
with open(path, 'r') as file:
    count = 1
    pairCount = 1
    for line in file:
        url = line.split(" ")[0]
        fixcommit = line.split(" ")[1].strip()
        apiurl = url + "/commits/" + fixcommit
        headers = {'Authorization': 'token ' + tokens}
        remainingNumber = util.getRemaining(tokens)

        # When the GitHub rate limit is nearly exhausted, sleep until the
        # quota has been restored.
        if int(remainingNumber) < 5:
            while int(remainingNumber) < 5000:
                time.sleep(100)
                remainingNumber = util.getRemaining(tokens)
                print("remaining number is " + remainingNumber +
                      ", not enough limit, sleeping a while")
        login = requests.get(apiurl, headers=headers)
        apiinfor = login.json()
        count = count + 1
        if "url" in apiinfor:
            sys.stdout.write("token remaining:" + remainingNumber + " ")
            print(partation + " pairCount " + str(pairCount) + " " +
                  partation + " " + tokenNumber)
            eachPairPath = clonePath + "/" + str(pairCount)
            util.createPath(eachPairPath)
            getInfoFromFixCommit(apiinfor, fixcommit, eachPairPath, url)
            pairCount = pairCount + 1

print(partation + " archiveInfo.txt DONE!!!!")
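
util.getRemaining is not shown either. Since the callers both int() the result and concatenate it into strings, it plausibly queries GitHub's rate-limit endpoint and returns the remaining core quota as a string; a hedged sketch:

import requests

def getRemaining(token):
    # Ask the GitHub API how many requests remain in the current window.
    resp = requests.get('https://api.github.com/rate_limit',
                        headers={'Authorization': 'token ' + token})
    # Return a string because the call sites both int() and concatenate it.
    return str(resp.json()['resources']['core']['remaining'])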
Example #7
            app_conf = load(open(pjoin(configurationDirectory,
                                       "..\\settings\\app_config.yaml"), 'r'),
                            Loader=Loader)

            app_conf.get('view_server')['address'] = view_server_info['ip']
            app_conf.get('view_server')['port'] = view_server_info['port']

            DataCollectionRuntime.results_root_folder = \
                app_conf.get('results_root_folder')
            if not (pexists(DataCollectionRuntime.results_root_folder)
                    and pisdir(DataCollectionRuntime.results_root_folder)):
                DataCollectionRuntime.results_root_folder = abspath(pjoin(
                    DataCollectionRuntime.script_dir,
                    DataCollectionRuntime.results_root_folder))
            util.createPath(DataCollectionRuntime.results_root_folder)

            cmd = None
            app_conf = None
        elif cmd == 'START_EXP_SESSION':
            if runtime:
                runtime.close()
                runtime._close()
                runtime = None
            ### Update App Config Settings for this exp. session ###
            #
            # Read the default app config yaml file
            #
            app_conf_path = pjoin(configurationDirectory, "..\\settings\\app_config.yaml")
            app_conf = load(open(app_conf_path, 'r'), Loader=Loader)
Example #8
def train_model_each_cluster(args, cluster_size, embtype):
    logger.info("Cluster Size: {}".format(cluster_size))
    args.aspect_size = cluster_size
    if args.seed > 0:
        np.random.seed(args.seed)

    aspect_file_name = config.aspect_file_name[args.lang].format(args.emb_technique, config.word_emb_training_type, args.epochs, config.filter_word_on, embtype, cluster_size)
    model_path = config.model_param_file[args.lang].format(args.emb_technique, config.word_emb_training_type, args.epochs, config.filter_word_on)
    util.createPath(aspect_file_name)

    vocab, train_x, overall_maxlen = dataset.get_data(vocab_size=args.vocab_size,
                                                      maxlen=args.maxlen, lang=args.lang)
    train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
    print('Number of training examples: ', len(train_x))
    print('Length of vocab: ', len(vocab))

    optimizer = get_optimizer(args.algorithm)
    logger.info('Building {} based model for {}'.format(args.emb_technique, args.lang))
    model = create_model(args, overall_maxlen, vocab)
    # Freeze the word embedding layer
    model.get_layer('word_emb').trainable = False
    model.compile(optimizer=optimizer, loss=util.max_margin_loss, metrics=[util.max_margin_loss])

    logger.info("-" * 80)

    # Invert the vocabulary so aspect descriptions can be mapped back to words.
    vocab_inv = {}
    for w, ind in vocab.items():
        vocab_inv[ind] = w

    sen_gen = sentence_batch_generator(train_x, args.batch_size)
    neg_gen = negative_batch_generator(train_x, args.batch_size, args.neg_size)
    batches_per_epoch = len(train_x) // args.batch_size

    min_loss = float('inf')
    for ii in range(args.epochs):
        t0 = time()
        loss, max_margin_loss = 0., 0.

        for b in tqdm(range(batches_per_epoch)):
            sen_input = next(sen_gen)
            neg_input = next(neg_gen)

            batch_loss, batch_max_margin_loss = model.train_on_batch([sen_input, neg_input],
                                                                     np.ones((args.batch_size, 1)))
            loss += batch_loss / batches_per_epoch
            max_margin_loss += batch_max_margin_loss / batches_per_epoch

        tr_time = time() - t0

        # Whenever the epoch loss improves, save the model and write out the
        # top words per aspect (ranked by similarity to the aspect embedding).
        if loss < min_loss:
            min_loss = loss
            word_emb = K.get_value(model.get_layer('word_emb').embeddings)
            aspect_emb = K.get_value(model.get_layer('aspect_emb').W)
            word_emb = word_emb / np.linalg.norm(word_emb, axis=-1, keepdims=True)
            aspect_emb = aspect_emb / np.linalg.norm(aspect_emb, axis=-1, keepdims=True)
            aspect_file = open(aspect_file_name, 'wt', encoding='utf-8')
            model.save(model_path)
            for ind in range(len(aspect_emb)):
                desc = aspect_emb[ind]
                sims = word_emb.dot(desc.T)
                ordered_words = np.argsort(sims)[::-1]
                desc_list = [vocab_inv[w] + "|" + str(sims[w]) for w in ordered_words[:50]]
                aspect_file.write('Aspect %d:\n' % ind)
                aspect_file.write(' '.join(desc_list) + '\n\n')
            aspect_file.close()

        per_cluster_train_loss = loss

        logger.info('Epoch %d, train: %is' % (ii, tr_time))
        logger.info(
            'Total loss: %.4f, max_margin_loss: %.4f, ortho_reg: %.4f' % (
                loss, max_margin_loss, loss - max_margin_loss))

    return per_cluster_train_loss
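
sentence_batch_generator and negative_batch_generator come from the surrounding project (the loop appears to follow the ABAE-style max-margin training scheme). A minimal sketch producing the shapes train_on_batch expects, namely (batch_size, maxlen) positives and (batch_size, neg_size, maxlen) negatives:

import numpy as np

def sentence_batch_generator(data, batch_size):
    # Endlessly yield shuffled, fixed-size batches of padded sentences.
    n_batch = len(data) // batch_size
    while True:
        indices = np.random.permutation(len(data))
        for b in range(n_batch):
            yield data[indices[b * batch_size:(b + 1) * batch_size]]

def negative_batch_generator(data, batch_size, neg_size):
    # For every instance in a batch, sample neg_size random sentences to
    # act as negatives for the max-margin loss.
    while True:
        idx = np.random.randint(0, len(data), (batch_size, neg_size))
        yield data[idx]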
Example #9
            remainingNumber = util.getRemaining(tokenKey)

            # Same rate-limit guard as in Example #6: wait for the quota to refill.
            if int(remainingNumber) < 5:
                while int(remainingNumber) < 5000:
                    time.sleep(100)
                    remainingNumber = util.getRemaining(tokenKey)
                    print("remaining number is " + remainingNumber +
                          ", not enough limit, sleeping a while")

            requestFrombuggy = requests.get(buggyUrls[i],
                                            headers=headers).json()
            requestFromfix = requests.get(fixUrls[i], headers=headers).json()
            print("github key remaining rate limit " + remainingNumber)
            print(eachPairPath + " " + tokenNumber + "," + str(len(allfiles)) +
                  " files")
            if "download_url" in requestFrombuggy and "download_url" in requestFromfix:
                if requestFrombuggy["download_url"] is not None:
                    buggyDownUrl = requestFrombuggy["download_url"]
                    fixDownUrl = requestFromfix["download_url"]
                    buggyversionPath = eachPairPath + "/buggy-version"
                    fixedversionPath = eachPairPath + "/fixed-version"
                    util.createPath(buggyversionPath)
                    util.createPath(fixedversionPath)
                    commonpath = buggyUrls[i].split("contents/")[1].split(
                        "?ref=")[0]
                    util.cdAndWget(buggyDownUrl, buggyversionPath,
                                   commonpath.replace("/", "."))
                    util.cdAndWget(fixDownUrl, fixedversionPath,
                                   commonpath.replace("/", "."))
print("Buggy and fixed versions DONE!!!!!")