import os
import pickle

import numpy as np

# Project-local helpers assumed to be defined/imported elsewhere in this repo:
# pdf_to_str, preprocess_text, extract_evaluation_part, vstack_sparse
# (vstack_sparse presumably wraps scipy.sparse.vstack).


def _merge_samples(samples_list, sparse):
    """Stack a list of sample matrices and return the result with its row count."""
    if sparse:
        samples = vstack_sparse(samples_list)
    else:
        samples = np.vstack(samples_list)
    return samples, samples.shape[0]
def _concatenate_samples(sparse, *samples):
    """Stack sample matrices passed as varargs, dense or sparse."""
    if not sparse:
        return np.vstack(samples)
    return vstack_sparse(samples)
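# A minimal usage sketch of the two helpers above (illustrative only; the
# arrays here are hypothetical, and for sparse=True the inputs would be
# scipy.sparse matrices handled by vstack_sparse):
#
#   a = np.zeros((2, 3))
#   b = np.ones((4, 3))
#   merged, n_samples = _merge_samples([a, b], sparse=False)  # n_samples == 6
#   stacked = _concatenate_samples(False, a, b)               # shape (6, 3)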
def add_articles(papers_data_path, new_papers_path, pdf_papers_path,
                 preprocessed_papers_path):
    """
    Add new articles to the current article base (indexes every paper from
    new_papers_path, then moves the pdfs out of that directory).

    :param papers_data_path: path to papers data (all data is saved here)
    :param new_papers_path: path, relative to papers_data_path, to new paper
        pdfs (all papers are added from here)
    :param pdf_papers_path: path, relative to papers_data_path, where pdf
        files are stored after indexing
    :param preprocessed_papers_path: path, relative to papers_data_path, where
        preprocessed paper texts are saved
    """
    if not os.path.exists(os.path.join(papers_data_path, new_papers_path)):
        os.mkdir(os.path.join(papers_data_path, new_papers_path))

    with open(os.path.join(papers_data_path, 'papers_index.pkl'), 'rb') as f:
        papers_index = pickle.load(f)
    with open(os.path.join(papers_data_path, 'tfidf_vectorizer.pkl'), 'rb') as f:
        vectorizer = pickle.load(f)
    with open(os.path.join(papers_data_path, 'tfidf_matrix.pkl'), 'rb') as f:
        tfidf_data, words = pickle.load(f)
    with open(os.path.join(papers_data_path, 'dataset_with_articles.pkl'), 'rb') as f:
        dataset_with_articles = pickle.load(f)
    with open(os.path.join(papers_data_path, 'wiki_datasets.pkl'), 'rb') as f:
        datasets = pickle.load(f)

    # first free id for the new papers
    max_ind = max(papers_index.keys()) + 1

    for i, paper in enumerate(
            os.listdir(os.path.join(papers_data_path, new_papers_path))):
        paper_id = max_ind + i
        paper_name = paper.replace('.pdf', '')
        papers_index[paper_id] = paper_name

        # extract and preprocess the paper text, then cache it as plain text
        paper_str = preprocess_text(
            pdf_to_str(os.path.join(papers_data_path, new_papers_path, paper)))
        with open(
                os.path.join(papers_data_path, preprocessed_papers_path,
                             paper_name + '.txt'), 'w') as f:
            f.write(paper_str)

        # update the tf-idf matrix with the new document's row
        article_tfidf = vectorizer.transform([paper_str])
        tfidf_data = vstack_sparse([tfidf_data, article_tfidf])

        # move the paper from new_papers_path to the pdf storage path
        os.rename(os.path.join(papers_data_path, new_papers_path, paper),
                  os.path.join(papers_data_path, pdf_papers_path, paper))

        # link the paper to every dataset mentioned in its evaluation section
        if isinstance(paper_str, str):
            evaluation_text = extract_evaluation_part(paper_str, papers_data_path)
            if evaluation_text is not None:
                evaluation_text = evaluation_text.lower()
                for dataset in datasets:
                    if dataset in evaluation_text:
                        dataset_with_articles[dataset].append(paper_id)

    with open(os.path.join(papers_data_path, 'papers_index.pkl'), 'wb') as f:
        pickle.dump(papers_index, f)
    with open(os.path.join(papers_data_path, 'tfidf_matrix.pkl'), 'wb') as f:
        pickle.dump([tfidf_data, words], f)
    with open(os.path.join(papers_data_path, 'dataset_with_articles.pkl'), 'wb') as f:
        pickle.dump(dataset_with_articles, f)
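# A hedged usage sketch for add_articles (the directory names below are
# hypothetical; papers_data_path must already contain papers_index.pkl,
# tfidf_vectorizer.pkl, tfidf_matrix.pkl, dataset_with_articles.pkl and
# wiki_datasets.pkl produced by the initial indexing step):
#
#   add_articles(papers_data_path='data/papers',
#                new_papers_path='new_pdfs',
#                pdf_papers_path='pdfs',
#                preprocessed_papers_path='preprocessed')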