# Example 1
def _merge_samples(samples_list, sparse):
    if sparse:
        samples = vstack_sparse(samples_list)
    else:
        samples = np.vstack(samples_list)

    return samples, samples.shape[0]
# Example 2
def _concatenate_samples(sparse, *samples):
    if not sparse:
        return np.vstack(samples)
    else:
        return vstack_sparse(samples)
# Example 3
def add_articles(papers_data_path, new_papers_path, pdf_papers_path,
                 preprocessed_papers_path):
    """Add new articles to the current article base.

    Every PDF found under ``new_papers_path`` is preprocessed, appended to the
    TF-IDF matrix, scanned for dataset mentions, and moved into
    ``pdf_papers_path``; the updated indices are pickled back to disk.

    :param papers_data_path: path to papers data (all data is saved here)
    :param new_papers_path: path, relative to papers_data_path, with new paper PDFs
    :param pdf_papers_path: path, relative to papers_data_path, where PDFs are stored
    :param preprocessed_papers_path: path, relative to papers_data_path, for
        preprocessed paper texts
    """
    if not os.path.exists(os.path.join(papers_data_path, new_papers_path)):
        os.mkdir(os.path.join(papers_data_path, new_papers_path))

    # Load the persisted state: paper-id index, fitted vectorizer, the
    # TF-IDF matrix with its vocabulary, dataset->papers mapping, and the
    # list of known dataset names.
    with open(os.path.join(papers_data_path, 'papers_index.pkl'), 'rb') as f:
        papers_index = pickle.load(f)

    with open(os.path.join(papers_data_path, 'tfidf_vectorizer.pkl'),
              'rb') as f:
        vectorizer = pickle.load(f)

    with open(os.path.join(papers_data_path, 'tfidf_matrix.pkl'), 'rb') as f:
        tfidf_data, words = pickle.load(f)

    with open(os.path.join(papers_data_path, 'dataset_with_articles.pkl'),
              'rb') as f:
        dataset_with_articles = pickle.load(f)

    with open(os.path.join(papers_data_path, 'wiki_datasets.pkl'), 'rb') as f:
        datasets = pickle.load(f)

    # First free id for the new papers (assumes a non-empty existing index;
    # max() raises ValueError on an empty one — TODO confirm that is intended).
    max_ind = max(papers_index.keys()) + 1

    for i, paper in enumerate(
            os.listdir(os.path.join(papers_data_path, new_papers_path))):
        paper_id = max_ind + i
        paper_name = paper.replace('.pdf', '')
        papers_index[paper_id] = paper_name
        paper_str = preprocess_text(
            pdf_to_str(os.path.join(papers_data_path, new_papers_path, paper)))

        # Persist the preprocessed text for later reuse.
        with open(
                os.path.join(papers_data_path, preprocessed_papers_path,
                             paper_name + '.txt'), 'w') as f:
            f.write(paper_str)

        # Append the new paper's vector to the existing TF-IDF matrix.
        article_tfidf = vectorizer.transform([paper_str])
        tfidf_data = vstack_sparse([tfidf_data, article_tfidf])

        # Move the paper from new_papers_path to the permanent PDF store.
        os.rename(os.path.join(papers_data_path, new_papers_path, paper),
                  os.path.join(papers_data_path, pdf_papers_path, paper))

        # isinstance instead of type(...) == str: idiomatic and subclass-safe.
        if isinstance(paper_str, str):
            evaluation_text = extract_evaluation_part(paper_str,
                                                      papers_data_path)

            if evaluation_text is not None:
                evaluation_text = evaluation_text.lower()

                # Record which known datasets this paper mentions in its
                # evaluation section.
                for dataset in datasets:
                    search_dataset_in_paper = evaluation_text.find(dataset)

                    if search_dataset_in_paper != -1:
                        dataset_with_articles[dataset].append(paper_id)

    # Write the updated state back to disk.
    with open(os.path.join(papers_data_path, 'papers_index.pkl'), 'wb') as f:
        pickle.dump(papers_index, f)

    with open(os.path.join(papers_data_path, 'tfidf_matrix.pkl'), 'wb') as f:
        pickle.dump([tfidf_data, words], f)

    with open(os.path.join(papers_data_path, 'dataset_with_articles.pkl'),
              'wb') as f:
        pickle.dump(dataset_with_articles, f)