def test_reference_list_ids(single_article_path):
    expected = [
        '000303524000001.127',
        'WOS:000074141300003',
        'WOS:000076444100008',
        'WOS:000084608600019',
        'WOS:000169756700007',
        'WOS:000185914300012',
        'WOS:000264798000005.30',
        'WOS:A1997XH27400004'
    ]
    ref_list = ArticleCollection(single_article_path).reference_list()
    assert sorted(ref_list.ids()) == expected
def test_source_title_matching(articles_sample, output_filepath):
    journal_titles = [
        "THEORETICAL FOUNDATIONS OF CHEMICAL ENGINEERING",
        "DOWN BEAT"
    ]
    matches = ArticleCollection(articles_sample).select(
        SourceTitleMatcher(journal_titles), output_filepath)
    assert sum(1 for article in matches) == 2
def test_id_matching(articles_sample, output_filepath):
    ids = [
        "WOS:000251423400047",
        "WOS:000249481100010",
        "WOS:not-in-sample-set"
    ]
    matches = ArticleCollection(articles_sample).select(
        IdMatcher(ids), output_filepath)
    assert sum(1 for article in matches) == 2
    # Only two of the three ids occur in the sample, so the output file
    # holds len(ids) - 1 lines.
    assert sum(1 for line in open(output_filepath)) == len(ids) - 1
def test_affiliation_matching_from_list(affiliated_sample, output_filepath):
    uwmadison_names = [
        "University of Wisconsin Madison",
        "Univ Wisconsin",
        "University Wisconsin Health"
    ]
    matches = ArticleCollection(affiliated_sample).select(
        AffiliationMatcher(uwmadison_names), output_filepath)
    assert sum(1 for article in matches) == 1
# The @pytest.fixture decorator is assumed here: this function takes fixtures
# as arguments and returns a value rather than asserting. tmpdir is requested
# but unused in the body shown.
@pytest.fixture
def articles_sample_reflist(tmpdir, articles_sample):
    return ArticleCollection(articles_sample).reference_list()
print("Find by WOS Export - Processing Data for Year:", year, cluster_id, process_id) finder_logging.configure(cluster_id, process_id) match_collections = [] ids = [ article['UT'] for article in csv.DictReader(open(exported_file), delimiter='\t') ] criteria = IdMatcher(ids) for input_file in input_files: output_file = os.path.splitext( os.path.basename(input_file))[0] + "-article-matches.json" match_collections.append( ArticleCollection(input_file).select(criteria, output_file)) article_match_count = 0 reference_count = 0 for matched_collection in match_collections: with open(references_file, 'a') as file: for article in matched_collection: article_match_count += 1 for reference in article.references(): reference_count += 1 year = reference['year'] if reference[ 'year'] is not None else 'BLANK' id = reference['id'] if reference['id'] is not None else '' file.write('\t'.join([year, id]) + '\n') print("Articles: ", article_match_count)
start = datetime.today()
output_dir = sys.argv[1]
year = sys.argv[2]

# A lookup table mapping hash bins to their output files.
output_files = {}
input_files = glob.glob(output_dir + "/*.json")
article_count = 0
reference_count = 0
wos_rec_refs = 0

# Loop through the article collections and write the references out.
for input_file in input_files:
    print(input_file)
    for article in ArticleCollection(input_file):
        article_count += 1
        for reference in article.references():
            reference_count += 1
            if reference['id'] is not None and matches_source_item(reference['id']):
                wos_rec_refs += 1
                # First hash the ID of the referenced article. This will help
                # ensure that the output files are evenly distributed in size.
                hash_id = hashlib.sha1(bytearray(reference['id'], "utf-8")).hexdigest()
                file = get_file_bin(output_files, hash_id, output_dir, year)
                # Next write the entry to the output file corresponding to the
                # current hash ID.
                json_str = json.dumps({
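# The two helpers called above, matches_source_item() and get_file_bin(), are
# not defined in this snippet. The sketches below are hypothetical
# reconstructions inferred only from the call sites: the helper names come
# from the script, but the id pattern, bin width, and file-naming scheme are
# assumptions, not the project's actual implementation.

import os
import re


def matches_source_item(ref_id):
    # Assumption: a "source item" reference carries a WOS UT identifier,
    # e.g. "WOS:000084608600019".
    return re.match(r'^WOS:', ref_id) is not None


def get_file_bin(output_files, hash_id, output_dir, year):
    # Bin on the first hex digit of the SHA-1 digest so records spread across
    # 16 files of roughly even size; cache the open handle in the lookup
    # table so each bin's file is opened only once.
    bin_key = hash_id[0]
    if bin_key not in output_files:
        path = os.path.join(output_dir, "references-%s-%s.json" % (year, bin_key))
        output_files[bin_key] = open(path, 'a')
    return output_files[bin_key]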
def test_reference_list_years(single_article_path):
    expected = ['1996', '1997', '1998', '1999', '2000', '2001', '2003']
    ref_list = ArticleCollection(single_article_path).reference_list()
    assert sorted(ref_list.years()) == expected
def test_article_parsing(articles_sample):
    assert sum(1 for article in ArticleCollection(articles_sample)) == 10
def test_case_insensitive_phrase_matching(articles_sample, output_filepath):
    matches = ArticleCollection(articles_sample).select(
        PhraseMatcher('baker'), output_filepath)
    assert sum(1 for article in matches) == 1
def test_citation_matching(articles_sample, output_filepath):
    ids = ["WOS:000084608600019"]
    matches = ArticleCollection(articles_sample).select(
        CitationMatcher(ids), output_filepath)
    assert sum(1 for article in matches) == 2
    assert sum(1 for line in open(output_filepath)) == 2
start = datetime.today()
year = sys.argv[1]
cluster_id = sys.argv[2]
process_id = sys.argv[3]
input_files = glob.glob("data/*.json")
article_file = "articles.csv"

print("Find Citing Records - Processing Data for Year:", year, cluster_id, process_id)
finder_logging.configure(cluster_id, process_id)

# Gather the target article ids, then write each input file's citing
# records to a matching output file.
ids = set()
with open(article_file) as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        ids.add(row["ID"])

for input_file in input_files:
    output_file = "output/" + os.path.splitext(os.path.basename(input_file))[0] + "-citing-records.json"
    ArticleCollection(input_file).select(CitationMatcher(ids), output_file)

finish = datetime.today()
print("Started: ", start)
print("Finished: ", finish)
print("Time Spent:", finish - start, "\n")
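# Usage sketch, inferred from the sys.argv reads above (the script filename
# is an assumption):
#
#   python find_citing_records.py <year> <cluster_id> <process_id>
#
# articles.csv is expected to provide an "ID" column, data/*.json holds the
# input collections, and the citing-record matches are written under output/.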