Code Example #1
def test_reference_list_ids(single_article_path):
    expected = [
        '000303524000001.127', 'WOS:000074141300003', 'WOS:000076444100008',
        'WOS:000084608600019', 'WOS:000169756700007', 'WOS:000185914300012',
        'WOS:000264798000005.30', 'WOS:A1997XH27400004'
    ]
    ref_list = ArticleCollection(single_article_path).reference_list()
    assert sorted(ref_list.ids()) == expected
Code Example #2
def test_source_title_matching(articles_sample, output_filepath):
    journal_titles = [
        "THEORETICAL FOUNDATIONS OF CHEMICAL ENGINEERING", "DOWN BEAT"
    ]
    matches = ArticleCollection(articles_sample).select(
        SourceTitleMatcher(journal_titles), output_filepath)
    assert sum(1 for article in matches) == 2
Code Example #3
def test_id_matching(articles_sample, output_filepath):
    ids = [
        "WOS:000251423400047", "WOS:000249481100010", "WOS:not-in-sample-set"
    ]
    matches = ArticleCollection(articles_sample).select(
        IdMatcher(ids), output_filepath)
    assert sum(1 for article in matches) == 2
    assert sum(1 for line in open(output_filepath)) == len(ids) - 1
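Note how select() does double duty in these tests: it returns an iterable of the matching articles and also writes one record per match to output_filepath, which is why the final assertion can count lines in that file.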
Code Example #4
def test_affiliation_matching_from_list(affiliated_sample, output_filepath):
    uwmadison_names = [
        "University of Wisconsin Madison", "Univ Wisconsin",
        "University Wisconsin Health"
    ]
    matches = ArticleCollection(affiliated_sample).select(
        AffiliationMatcher(uwmadison_names), output_filepath)
    assert sum(1 for article in matches) == 1
Code Example #5
@pytest.fixture  # assumed: the fixture-style parameters imply a pytest fixture
def articles_sample_reflist(tmpdir, articles_sample):
    return ArticleCollection(articles_sample).reference_list()
Code Example #6
print("Find by WOS Export - Processing Data for Year:", year, cluster_id,
      process_id)
finder_logging.configure(cluster_id, process_id)

match_collections = []
# Collect the WOS identifiers (the "UT" column) from the exported TSV file.
with open(exported_file) as tsv_file:
    ids = [
        article['UT']
        for article in csv.DictReader(tsv_file, delimiter='\t')
    ]
criteria = IdMatcher(ids)
for input_file in input_files:
    output_file = os.path.splitext(
        os.path.basename(input_file))[0] + "-article-matches.json"
    match_collections.append(
        ArticleCollection(input_file).select(criteria, output_file))

article_match_count = 0
reference_count = 0
for matched_collection in match_collections:
    with open(references_file, 'a') as ref_file:
        for article in matched_collection:
            article_match_count += 1
            for reference in article.references():
                reference_count += 1
                # Substitute placeholders for missing values, then write a
                # tab-delimited year/id pair for each reference.
                ref_year = reference['year'] if reference[
                    'year'] is not None else 'BLANK'
                ref_id = reference['id'] if reference['id'] is not None else ''
                ref_file.write('\t'.join([ref_year, ref_id]) + '\n')

print("Articles:  ", article_match_count)
Code Example #7
import glob
import hashlib
import json
import sys
from datetime import datetime

# ArticleCollection, matches_source_item, and get_file_bin come from the
# project's own modules.
start = datetime.today()
output_dir = sys.argv[1]
year = sys.argv[2]

# A lookup table mapping hash-ID bins to their output files (see get_file_bin).
output_files = {}
input_files = glob.glob(output_dir + "/*.json")

article_count = 0
reference_count = 0
wos_rec_refs = 0
# Loop through the article collections and write their references out.
for input_file in input_files:
    print(input_file)
    for article in ArticleCollection(input_file):
        article_count += 1
        for reference in article.references():
            reference_count += 1
            if reference['id'] is not None and matches_source_item(
                    reference['id']):
                wos_rec_refs += 1

                # First, hash the ID of the referenced article. This helps
                # ensure that the output files are evenly distributed in size.
                hash_id = hashlib.sha1(bytearray(reference['id'],
                                                 "utf-8")).hexdigest()
                file = get_file_bin(output_files, hash_id, output_dir, year)

                # Next, write the entry to the output file for the current
                # hash ID. The source example breaks off mid-statement here,
                # so the dumped fields below are assumed for illustration.
                json_str = json.dumps({'id': reference['id']})
                file.write(json_str + '\n')
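Code Example #7 depends on a get_file_bin helper that is not shown. Below is a minimal sketch consistent with the call site above; the two-hex-digit bin width and the file-naming scheme are assumptions, not the project's actual implementation.

import os

def get_file_bin(output_files, hash_id, output_dir, year):
    # Bucket by the first two hex digits of the SHA-1 digest (256 bins) and
    # cache the open handle so each bin's file is opened only once.
    # NOTE: the bin width and the file name here are assumptions.
    bin_key = hash_id[:2]
    if bin_key not in output_files:
        path = os.path.join(output_dir, "references-%s-%s.json" % (year, bin_key))
        output_files[bin_key] = open(path, 'a')
    return output_files[bin_key]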
Code Example #8
def test_reference_list_years(single_article_path):
    expected = ['1996', '1997', '1998', '1999', '2000', '2001', '2003']
    ref_list = ArticleCollection(single_article_path).reference_list()
    assert sorted(ref_list.years()) == expected
Code Example #9
def test_article_parsing(articles_sample):
    assert sum(1 for article in ArticleCollection(articles_sample)) == 10
Code Example #10
def test_case_insensitive_phrase_matching(articles_sample, output_filepath):
    matches = ArticleCollection(articles_sample).select(
        PhraseMatcher('baker'), output_filepath)
    assert sum(1 for article in matches) == 1
Code Example #11
def test_citation_matching(articles_sample, output_filepath):
    ids = ["WOS:000084608600019"]
    matches = ArticleCollection(articles_sample).select(
        CitationMatcher(ids), output_filepath)
    assert sum(1 for article in matches) == 2
    assert sum(1 for line in open(output_filepath)) == 2
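The test functions in these examples receive pytest fixtures (single_article_path, articles_sample, affiliated_sample, output_filepath) defined elsewhere in the project. A minimal conftest.py sketch of what they might look like, with every directory and file name assumed for illustration:

import os

import pytest

SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "data")  # assumed location


@pytest.fixture
def single_article_path():
    return os.path.join(SAMPLE_DIR, "single_article.json")  # assumed name


@pytest.fixture
def articles_sample():
    return os.path.join(SAMPLE_DIR, "articles_sample.json")  # assumed name


@pytest.fixture
def affiliated_sample():
    return os.path.join(SAMPLE_DIR, "affiliated_sample.json")  # assumed name


@pytest.fixture
def output_filepath(tmpdir):
    # tmpdir is pytest's built-in temporary-directory fixture.
    return str(tmpdir.join("matches.json"))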
Code Example #12
import csv
import glob
import os
import sys
from datetime import datetime

# finder_logging, ArticleCollection, and CitationMatcher come from the
# project's own modules.
start = datetime.today()
year = sys.argv[1]
cluster_id = sys.argv[2]
process_id = sys.argv[3]

input_files = glob.glob("data/*.json")
article_file = "articles.csv"

print("Find Citing Records - Processing Data for Year:", year, cluster_id,
      process_id)
finder_logging.configure(cluster_id, process_id)

# Collect the set of article IDs whose citing records we want to find.
ids = set()
with open(article_file) as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        ids.add(row["ID"])

for input_file in input_files:
    output_file = "output/" + os.path.splitext(
        os.path.basename(input_file))[0] + "-citing-records.json"
    ArticleCollection(input_file).select(CitationMatcher(ids), output_file)

finish = datetime.today()
print("Started:   ", start)
print("Finished:  ", finish)
print("Time Spent:", finish - start, "\n")