Example no. 1
def extract_citations():
    """
    Extract citations from the papers, based on the titles of other papers.
    """
    # Map each paper title to its id; skip one-word titles, which would
    # produce far too many false substring matches.
    titles = {p.title.strip(): p.id for p in Papers.select()
              if len(p.title.strip().split(' ')) > 1}
    for paper in Papers.select():
        # A paper "cites" another paper when that paper's title appears
        # verbatim in its full text (excluding self-matches).
        citations = [
            paper_id for title, paper_id in titles.items()
            if title in paper.paper_text and paper_id != paper.id
        ]
        for citation in citations:
            create_citation(paper.id, citation)
        print("Paper {paper_id}".format(paper_id=paper.id))
        print(citations)
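
The create_citation helper is not part of this listing. A minimal sketch of what it could look like, assuming a peewee model named Citations with paper_id and cited_paper_id fields (both names are assumptions):

# Hypothetical sketch: the real create_citation is defined elsewhere.
def create_citation(paper_id, cited_paper_id):
    # get_or_create keeps the call idempotent, so re-running
    # extract_citations does not insert duplicate rows.
    Citations.get_or_create(paper_id=paper_id, cited_paper_id=cited_paper_id)
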
Example no. 2

def get_documents() -> str:
    """
    Given the document id set on the option parser, return the body of that document.
    Returns: the full text of the selected paper
    """
    paper = Papers.select().where(Papers.id == options.document).get()
    return paper.paper_text
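
The options object is assumed to come from a command-line parser defined outside this listing. A minimal argparse-based sketch that would provide the document attribute used above (the parser details are assumptions):

import argparse

# Hypothetical sketch: the listing does not show how `options` is built.
parser = argparse.ArgumentParser(description="Select a paper by id")
parser.add_argument("--document", type=int, required=True,
                    help="id of the paper whose body should be returned")
options = parser.parse_args()
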
Example no. 3
def main():
    papers = Papers.select().limit(LIMIT)
    n_citations = []
    for index, p in enumerate(papers[BEGIN:]):
        print(p.id - 1)
        n_citations.append(scrape_citation_count(p))
        # scrape_citation_count returns -1 once no more results are
        # available, which is used as a stop sentinel here.
        if n_citations[index] == -1:
            print("Finished after {} iterations".format(index))
            break
        # Throttle requests to avoid hammering the scraped service.
        time.sleep(2)
    citations_to_csv(n_citations)
    print(n_citations)
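
scrape_citation_count and citations_to_csv are defined elsewhere. A minimal sketch of the CSV side, assuming one count per paper and a citations.csv output path (both assumptions):

import csv

# Hypothetical sketch: the real citations_to_csv is not in this listing.
def citations_to_csv(n_citations, path="citations.csv"):
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "citation_count"])
        # One row per scraped paper, in scrape order.
        for index, count in enumerate(n_citations):
            writer.writerow([index, count])
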
Example no. 4
def main():
    if not os.path.isfile('ldamodel.pkl'):
        # Train on a random sample of 200 papers and cache the results.
        papers = [p.paper_text for p in Papers.select().order_by(fn.Random()).limit(200)]
        ldamodel, dictionary = train_classifier(papers, 20)
        with open('ldamodel.pkl', 'wb') as f:
            pickle.dump(ldamodel, f)
        with open('dictionary.pkl', 'wb') as f:
            pickle.dump(dictionary, f)
    else:
        with open('ldamodel.pkl', 'rb') as f:
            ldamodel = pickle.load(f)
        with open('dictionary.pkl', 'rb') as f:
            dictionary = pickle.load(f)
    if not os.path.isfile('labels.txt'):
        topic_labels = extract_topics(ldamodel)
        with open('labels.txt', 'w') as f:
            f.write("\n".join(topic_labels))
    else:
        with open('labels.txt', 'r') as f:
            topic_labels = f.read().splitlines()
    create_database_labels(topic_labels)
    print(topic_labels)
    label_documents(ldamodel, topic_labels, dictionary)
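
train_classifier is not shown here. A minimal gensim-based sketch consistent with how the returned model and dictionary are used above; the passes value and the reuse of clean() are assumptions:

from gensim import corpora
from gensim.models.ldamodel import LdaModel

# Hypothetical sketch: the real train_classifier is defined elsewhere.
def train_classifier(papers, num_topics):
    # Tokenise each document, then build the dictionary and BoW corpus.
    texts = [clean(text) for text in papers]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, num_topics=num_topics,
                        id2word=dictionary, passes=10)
    return ldamodel, dictionary
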
Example no. 5
def label_documents(model: LdaModel, topic_labels: list, dictionary: corpora.Dictionary):
    """
    Labels the documents in the database, including documents not present in the training set.
    Args:
        model: the trained LdaModel
        topic_labels: list with the labels of the topics
        dictionary: the dictionary that was used when scanning the training data
    """
    for paper in Papers.select():
        cleaned_text = clean(paper.paper_text)
        # Convert the cleaned tokens into the bag-of-words representation
        # the model was trained on, then score it against every topic.
        bow = dictionary.doc2bow(cleaned_text)
        topic_scores = model[bow]
        labels = []
        for topic_id, score in topic_scores:
            # Keep a topic only when it scores above the uniform baseline.
            if score > 1 / len(topic_scores):
                Papers_labels.get_or_create(paper_id=paper.id, label_id=topic_id)
                labels.append(topic_labels[topic_id])
        print(paper.title, labels)
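
clean() is shared between training and labelling but is not part of this listing. A minimal sketch using NLTK stopwords; the exact tokenisation rules and stopword list are assumptions:

import re
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

# Hypothetical sketch: the real clean() is defined elsewhere.
def clean(text):
    # Lowercase, keep alphabetic tokens only, drop stopwords and short tokens.
    tokens = re.findall(r'[a-z]+', text.lower())
    return [t for t in tokens if t not in STOPWORDS and len(t) > 2]
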
Example no. 6
def main():
    papers = Papers.select().limit(100)
    # Use the titles as cluster labels and the full texts as input data.
    labels, data = [p.title for p in papers], [p.paper_text for p in papers]
    data_feat = extract_features(data)
    distance_matrix = create_distance_matrix(data_feat)
    cluster(distance_matrix, labels)
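
extract_features, create_distance_matrix and cluster are defined elsewhere. A minimal sketch of the first two with scikit-learn, assuming TF-IDF features and cosine distances (both choices are assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Hypothetical sketches: the real implementations are not in this listing.
def extract_features(data):
    # TF-IDF vectors, one row per document.
    return TfidfVectorizer(stop_words='english').fit_transform(data)

def create_distance_matrix(data_feat):
    # Pairwise cosine distances between the document vectors.
    return cosine_distances(data_feat)
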
Example no. 7
def main():
    for paper in Papers.select().limit(10):
        # Take everything after the last "Reference(s)" heading as the
        # bibliography; re.IGNORECASE replaces the character-class spelling.
        citations = re.split(r'references?', paper.paper_text,
                             flags=re.IGNORECASE)[-1]
        print(citations.split('\n'))
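
The raw split leaves blank lines in the output. A small illustrative helper, not part of the original listing, that tidies the block before further processing:

# Purely illustrative helper.
def reference_lines(citations: str) -> list:
    # Drop blank lines and surrounding whitespace from the raw block.
    return [line.strip() for line in citations.split('\n') if line.strip()]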