Example #1
 def teste1N(self, diretorioSusp, nomeSusp):
     '''
     Tests a suspect document against all the source documents in the class directory
     '''
     corp = c.Corpus(self.diretorio)
     corp.carregarDiretorio()
     doc = self.buscarArquivo(diretorioSusp, nomeSusp)
     docsBasePlagio = corp.verificaPlagio(doc, 0.01)
     return self.salvarLogSaida(docsBasePlagio, nomeSusp)
Example #2
 def teste11(self, diretorioSusp, nomeSusp, nomeFonte):
     '''
     Tests a suspect document against a single source whose given name is found in the class directory
     '''
     corp = c.Corpus(self.diretorio)
     docFonte = corp.carregarDoc(self.diretorio + nomeFonte, nomeSusp)
     corp.lDocumentos.anexar(docFonte)
     doc = self.carregarDoc(diretorioSusp + nomeSusp, nomeSusp)
     docsBasePlagio = corp.verificaPlagio(doc, 0.01)
     return self.salvarLogSaida(docsBasePlagio, nomeSusp)
Example #3
def score_authors(author_list, abstract):
    """
    Scores a list of authors against a given abstract.

    :param author_list: A list of authors populated with papers
    :param abstract: Abstract to be scored against
    :return: None; similarity scores are stored on the author and paper objects
    """
    # create corpus from query words
    docs = {}
    cachedStopWords = stopwords.words("english")
    query = TextBlob(abstract.lower())
    docs[-1] = query
    corpWords = []
    for word in query.words:
        if word not in cachedStopWords and word not in corpWords:
            corpWords.append(word)
    # construct tf-idf vectors from documents
    maxCitations = 0
    for author in author_list:
        for paper in author.papers:
            if paper.citations > maxCitations:
                maxCitations = paper.citations
            if paper.id not in docs:
                docs[paper.id] = TextBlob(paper.desc.lower())
    corpus = Corpus(docs, corpWords)
    corpus.constructVectors()

    # cosine similarity
    query = corpus.scoredDocs[0].vector

    # original doc has id of -1
    for doc in corpus.scoredDocs:
        if doc.id == -1:
            query = doc.vector
    docDict = {}
    for document in corpus.scoredDocs:
        sim = cosine_sim(query, document.vector)
        document.addScore(sim)
        docDict[document.id] = sim

    for author in author_list:
        author.setCosineSimilarity(docDict)
        author.scorePapers(maxCitations)
        author.papers.sort(key=lambda paper: paper.finalScore, reverse=True)
        author.scoreAuthor()
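The snippet above relies on a cosine_sim helper that is not shown. A minimal sketch is given below, assuming each scored vector is a plain sequence of tf-idf weights; the actual Corpus/ScoredDoc classes are not part of this example.

import math

def cosine_sim(vec_a, vec_b):
    # Hypothetical helper: cosine similarity between two equal-length weight vectors.
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)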
Example #4
def re_gen(dataset, type, id):
    corpus = Corpus(config['CORPUS'][dataset], dataset)
    tmp_dir = f'./tmp/{dataset}/{type}/{id}'
    create_dir(tmp_dir)

    def get_random_corpus_file(type):
        original_file_path = random.sample(
            glob.glob(
                os.path.join(get_repo_dir(dataset),
                             f'./{type}/*/*-orig.java')), 1)[0]
        original_file_name = original_file_path.split('/')[-1].split(
            '-orig')[0] + '.java'
        tmp_original_path = os.path.join(tmp_dir, original_file_name)
        shutil.copy(original_file_path, tmp_original_path)
        return (original_file_name, '', tmp_original_path)

    gen_errored(corpus, get_random_corpus_file, dataset, type, id,
                get_repo_dir(dataset))
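The create_dir helper used above is not shown in this snippet; a short sketch of what it presumably does (an assumption, not taken from the source):

import os

def create_dir(path):
    # Hypothetical helper: create the directory (and any parents) if it does not already exist.
    os.makedirs(path, exist_ok=True)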
Example #5
def analys(corpus_name):

    corpus = Corpus(corpus_path[corpus_name], corpus_name)
    corpus.get_data()

    path = "plot/"
    X, Y = corpus.data
    print("size", len(Y))

    circle_plot(Histograme(corpus),
                path + "/" + corpus_name + "/",
                title=corpus_name + " : distribution of relationships")

    st = get_stop_words('en')
    st.extend(string.punctuation)
    st.extend([str(i) for i in range(10)])

    def rm_stop_words(dic):
        for i in st:
            if i in dic:
                dic[i] = 0
        return dic

    vocab, vocab_rel = get_vocab(corpus)
    vocab[''] = 0
    vocab = rm_stop_words(vocab)
    H = pd.DataFrame.from_dict(vocab, orient='index').nlargest(20,
                                                               0).to_dict()[0]
    histo(H, path + "/" + corpus_name + "/", title=corpus_name + " Histo")

    for i in get_rel_class(corpus):
        vocab = vocab_rel[i]
        vocab[''] = 0
        vocab = rm_stop_words(vocab)
        for k in H:
            if k in vocab:
                vocab[k] = 0
        Hi = pd.DataFrame.from_dict(vocab,
                                    orient='index').nlargest(20,
                                                             0).to_dict()[0]
        histo(Hi,
              path + "/" + corpus_name + "/",
              title=corpus_name + " relation " + i + " Histo")

    dist = Dist(corpus)
    box(dist, path + "/" + corpus_name + "/", title=corpus_name + " distances")

    dist = Dist(corpus)
    mean_frame, std = [], []
    for rel in dist.keys():
        df = pd.DataFrame.from_dict({rel: dist[rel]})
        mean_frame.append(df.mean())
        std.append(df.std())

    mean = pd.DataFrame(pd.concat(mean_frame), columns=["mean"])
    std = pd.DataFrame(pd.concat(std), columns=["std"])
    res = pd.concat((mean, std), axis=1)

    data = {'sentence length': [], 'Vocab': [], 'tokenisation length': []}
    tokenizer_bert, _ = get_bert()
    tokenizer_scibert, _ = get_bert(bert_type='scibert')

    for x in X:
        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('BERT VOCAB')
        data['tokenisation length'].append(len(tokenizer_bert.tokenize(x[0])))

        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('SciBERT VOCAB')
        data['tokenisation length'].append(
            len(tokenizer_scibert.tokenize(x[0])))

    data = pd.DataFrame(data)
    data = data.sort_values(by=['sentence length'])
    print(data)

    title = corpus_name + " tokenisation analysis"

    plt.rcParams["figure.figsize"] = (9, 9)

    pylab.mpl.style.use('seaborn')

    g = sns.relplot(x="sentence length",
                    y="tokenisation length",
                    hue="Vocab",
                    style="Vocab",
                    hue_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    kind="line",
                    data=data,
                    col_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    style_order=['SciBERT VOCAB', 'BERT VOCAB'])
    sns.despine()
    plt.title(title)

    plt.savefig(title + ".png")  # save before show(); otherwise the figure may already be cleared
    plt.show()
Example #6
def train():
    print "train"
    start_time = time.time()
    config = SiameseTCNNConfig()
    corpus = Corpus(train_file, vocab_file, 0.0, config.seq_length,
                    config.vocab_size)
    testcorpus = Corpus(test_file, vocab_file, 1.0, config.seq_length,
                        config.vocab_size)
    print(corpus)
    print(testcorpus)

    config.vocab_size = len(corpus.words)

    train_data = TensorDataset(torch.LongTensor(corpus.x_train1),
                               torch.LongTensor(corpus.x_train2),
                               torch.FloatTensor(corpus.y_train))
    test_data = TensorDataset(torch.LongTensor(testcorpus.x_test1),
                              torch.LongTensor(testcorpus.x_test2),
                              torch.FloatTensor(testcorpus.y_test))

    print('Configuring CNN model...')
    model = SiameseTextCNN(config)
    print(model)

    # optimizer and loss function
    # criterion = nn.CrossEntropyLoss(size_average=False)
    # criterion = torch.nn.BCELoss(reduce=False, size_average=False)
    if config.contra_loss:
        criterion = ContrastiveLoss()  # note: criterion stays undefined when contra_loss is off
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # set the mode to train
    print("Training and evaluating...")
    best_F1 = 0.0

    for epoch in range(config.num_epochs):
        # load the training data in batch
        model.train()
        train_loader = DataLoader(train_data, batch_size=config.batch_size)
        ii = 0
        for x1_batch, x2_batch, y_batch in train_loader:
            ii += 1
            if ii % 100 == 0: print(epoch, "batch", ii)
            inputs1, inputs2, targets = Variable(x1_batch), Variable(
                x2_batch), Variable(y_batch)

            optimizer.zero_grad()
            outputs1, outputs2 = model(inputs1, inputs2)  # forward computation

            loss = criterion(outputs1, outputs2, targets)
            """
            todo
            """
            # backward propagation and update parameters
            loss.backward()
            optimizer.step()

        # evaluate on both training and test dataset

        print "epoch", epoch
        train_loss, train_F1 = evaluate(train_data, model, criterion)
        test_loss, test_F1 = evaluate(test_data, model, criterion)
        #print "train_loss:",train_loss

        if test_F1 > best_F1:
            # store the best result
            best_F1 = test_F1
            improved_str = '*'
            torch.save(model.state_dict(), model_file)
        else:
            improved_str = ''

        time_dif = get_time_dif(start_time)
        msg = "Epoch {0:3}, Train_loss: {1:>7.3}, Train_F1 {2:>6.3%}, " \
              + "Test_loss: {3:>6.3}, Test_F1 {4:>6.3%}, Time: {5} {6}"
        print(
            msg.format(epoch + 1, train_loss, train_F1, test_loss, test_F1,
                       time_dif, improved_str))
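ContrastiveLoss is not a built-in PyTorch criterion. A minimal sketch of a typical implementation is shown below; the margin value and the label convention (1 for similar pairs, 0 for dissimilar pairs) are assumptions rather than details taken from the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ContrastiveLoss(nn.Module):
    # Hypothetical contrastive loss: pulls similar pairs together, pushes dissimilar pairs apart.
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, target):
        distance = F.pairwise_distance(output1, output2)
        loss = target * distance.pow(2) + \
               (1 - target) * torch.clamp(self.margin - distance, min=0.0).pow(2)
        return loss.mean()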
Example #7
        except:
            author = i['author']['name']
        txt = i['title'] + ". " + i['summary']
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        try:
            coAuth = [aut['name'] for aut in i['author']][1:]
        except:
            coAuth = "Pas de Co-Auteur"
        doc = Document.ArxivDocument(datet, i['title'], author, txt, i['id'],
                                     coAuth)
        corpus_Arxiv.add_doc(doc)


#Initializing the corpora
corpus_Reddit = Corpus.Corpus("Corona_red")
corpus_Arxiv = Corpus.Corpus("Corona_arx")

#Loading the data into the corpora
loadArxiv(corpus_Arxiv)
loadReddit(corpus_Reddit)

#Displaying the number of documents and authors
print("Created the Reddit corpus, %d documents and %d authors" %
      (corpus_Reddit.ndoc, corpus_Reddit.naut))
print("Created the Arxiv corpus, %d documents and %d authors" %
      (corpus_Arxiv.ndoc, corpus_Arxiv.naut))

print()

#Saving the corpora
Example #8
import Corpus
import numpy as np

import IBM1
import IBM2
import HMM

print("loading the corpus...")
corpus = Corpus.Corpus("eutrans/training", separator="#")
#corpus = Corpus.Corpus("corpus.txt", separator="---")
corpus.print_corpus_description()
print("...done")

#%% Testing IBM1
# print(" ")
# print("*"*50)
# print(" ")
# print("Building IBM1 item...")
# ibm1 = IBM1.IBM1(corpus)
# print("...done")
# print("starting to train IBM1...")
# ibm1_nb_training_step = 10
# imb1perplexityevol = ibm1.train(ibm1_nb_training_step, verbose=True)
# print("...done")
#
# print "\nIBM1 perplexity : ",ibm1.get_perplexity(),"\n"
#
# f2e = np.argmax(ibm1.proba_f_knowing_e,axis=1)
# print "IBM1 Translations :"
# for i in range(len(corpus.french_words)):
#     print corpus.french_words[i], " --> ", corpus.english_words[f2e[i]]
Example #9
from glove import Glove, Corpus

corpus = Corpus()

sent_token = [["안녕", "하세요"], ["지니티토리", "입니다"]]

corpus.fit(sent_token, window=20)

# model
glove = Glove(no_components=128, learning_rate=0.01)
glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=False)
glove.add_dictionary(corpus.dictionary)

# save
glove.save(DATA_DIR + '/glove_w20_epoch50.model')
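The saved model could later be reloaded and queried; a minimal usage sketch, assuming the glove-python API (Glove.load, most_similar) and the same DATA_DIR as above:

# Hypothetical follow-up: reload the trained model and look up nearest neighbours.
loaded = Glove.load(DATA_DIR + '/glove_w20_epoch50.model')
print(loaded.most_similar("안녕", number=5))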
Example #10
from Corpus import *

if __name__ == '__main__':

    #Read corpus
    #corp_path = raw_input("Please input the path of the corpus:\n")
    #train_file = raw_input("Please input the filename of the training data:\n")
    #gold_file = raw_input("Please input the filename of the gold label:\n")
    train_file = "trail.csv"
    gold_file = "trial.labels"
    corpus = Corpus(train_file)
    #corpus.readCourpus()

    #training part.
    '''...to be completed'''

    predict_file = "trial.predict"

    #Evaluation
    if (corpus.gold_file != gold_file):
        corpus.readGold(gold_file)
    if (corpus.predict_file != predict_file):
        corpus.readPrediction(predict_file)
    corpus.evaluation()
    corpus.print_result()
Example #11
        return (original_file_name, '', tmp_original_path)

    gen_errored(corpus, get_random_corpus_file, dataset, type, id,
                get_repo_dir(dataset))


if __name__ == '__main__':
    if sys.argv[2] == 'all':
        dataset_list = list_folders(get_repo_dir(''))
    else:
        dataset_list = sys.argv[2:]

    if len(sys.argv) >= 2 and sys.argv[1] == 'run':
        corpora = []
        for corpus in sys.argv[2:]:
            corpora.append(Corpus(config['CORPUS'][corpus], corpus))
        share = {
            key: config['DATASHARE'].getint(key)
            for key in ['learning', 'validation', 'testing']
        }
        for corpus in corpora:
            gen_dataset(corpus, share)
    if len(sys.argv) >= 2 and sys.argv[1] == 'exp':
        for dataset in tqdm(dataset_list, desc='datasets'):
            target = get_experiment_dir(dataset)
            if not os.path.exists(target):
                gen_experiment(dataset)
            run_experiment(dataset)
    if len(sys.argv) >= 2 and sys.argv[1] == 'exp-cs':
        results = {}
        for dataset in dataset_list:
Example #12
np.random.seed(SEED)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from gensim.models import word2vec

custom_embedding = WordEmbeddings('pathtotoEmbeddings.vec')

# now create the StackedEmbedding object that combines all embeddings
stacked_embeddings = StackedEmbeddings(embeddings=[
    custom_embedding
])  # , charlm_embedding_forward,charlm_embedding_backward])

dataset_dict = Dataset_load.load()

corp = Corpus.Corpus(dataset_dict,
                     embeddings_file_path=None,
                     stacked_embeddings=stacked_embeddings)

model_params = {
    "filter_width": 3,
    "embeddings_dropout": True,
    "n_filters": [256],
    "dense_dropout": True,
    "token_embeddings_dim": 300,
    "char_embeddings_dim": 50,
    "cell_type": 'lstm',
    "use_batch_norm": True,
    "concat_embeddings": True,
    "use_crf": True,
    "use_char_embeddins": True,
    "net_type": 'rnn',