def browse_aps(path_, database_file):
    """Walk *path_* for APS ``.json`` article files and write one line per
    (doi, journal, year, country) combination to *database_file*.

    Country names containing spaces are re-joined with ``.`` so the output
    stays space-separated. Relies on module-level helpers
    (``get_all_affiliations``, ``get_year_jsonfile``,
    ``get_journal_short_json``, ``get_doi``, ``extract_country``).
    """
    # header = ['#', 'doi', 'journal', 'year', 'country']
    # Context manager guarantees the output file is closed even if a
    # helper raises (original leaked the handle and never closed it).
    with open(database_file, 'w') as fo:
        for root, dirs, files in os.walk(path_):
            for name in files:
                if not name.endswith(".json"):
                    continue
                jfile = root + "/" + name
                affiliations_ = get_all_affiliations(jfile)
                year_ = get_year_jsonfile(jfile)
                journal_ = get_journal_short_json(jfile)
                doi_ = get_doi(jfile)
                countries_list = []
                for aff_ in affiliations_:
                    if not aff_:
                        continue
                    try:
                        # NOTE: `except Exception, e:` in the original is
                        # Python 2-only syntax; `as` form works on 2.6+/3.x.
                        country_ = extract_country(aff_)
                    except Exception:
                        # best-effort: skip affiliations we cannot parse
                        continue
                    if country_:
                        countries_list.append(country_)
                # de-duplicate the per-paper country list
                for c_ in set(countries_list):
                    record = [doi_, journal_, year_, ".".join(c_.split())]
                    fo.write(" ".join(record) + "\n")
def browse_aps(path_, database_file):
    """Walk *path_* for APS ``.json`` files and write one line per paper:
    ``doi journal year num_coauthors`` (papers with zero coauthors are
    skipped).

    Returns ``True`` when the walk completes.
    """
    # header = ['#', 'doi', 'journal', 'year', 'country']
    # `with` ensures the file is closed even if a helper raises
    # (the original only closed it on the success path).
    with open(database_file, 'w') as fo:
        for root, dirs, files in os.walk(path_):
            for name in files:
                if not name.endswith(".json"):
                    continue
                jfile = root + "/" + name
                coauthors_number_ = len(get_coauthors_jsonfile(jfile))
                year_ = get_year_jsonfile(jfile)
                journal_ = get_journal_short_json(jfile)
                doi_ = get_doi(jfile)
                # skip records with no author information
                if coauthors_number_ > 0:
                    record = [doi_, journal_, year_, str(coauthors_number_)]
                    fo.write(" ".join(record) + "\n")
    return True
def browse_papers(path_, csv_file, fout):
    """Walk *path_* for APS ``.json`` files and write one space-separated
    summary row per paper to *fout*:

    year month day journal issue volume doi n_authors n_affs n_countries
    title_length num_pages n_citations n_references

    ``dict_1``/``dict_2`` come from ``parse_csv_file`` — presumably
    doi -> citing-papers and doi -> referenced-papers maps (verify
    against ``parse_csv_file``).
    """
    print("Processing citations ...")
    dict_1, dict_2 = parse_csv_file(csv_file)
    print("Processing files ...")
    # `with` closes the output file even on error (original never closed it)
    with open(fout, 'w') as fo:
        for root, dirs, files in os.walk(path_):
            for name in files:
                if not name.endswith(".json"):
                    continue
                jfile = root + "/" + name
                # close each JSON file promptly instead of leaking the handle
                with open(jfile) as jf:
                    data = json.load(jf)
                year, month, day = get_date_jsonfile(jfile, data)
                journal = get_journal_short_json(jfile, data)
                issue, volume = get_issue_volume(jfile, data)
                doi = get_doi(jfile, data)
                num_pages = get_number_of_pages(jfile, data)
                coauthors = get_coauthors_jsonfile(jfile, data)
                affiliations = get_all_affiliations(jfile, data)
                countries = get_all_countries(jfile, data)
                title = get_title(jfile, data)
                fields = [
                    year, month, day, journal, issue, volume, doi,
                    len(coauthors), len(affiliations), len(countries),
                    len(title), num_pages,
                    # dict.get avoids the double lookup of `in d.keys()`
                    len(dict_1.get(doi, [])),   # citations received
                    len(dict_2.get(doi, [])),   # references made
                ]
                # original emitted a trailing space before the newline;
                # keep the exact row format for downstream parsers
                fo.write(" ".join(str(f) for f in fields) + " \n")
def browse_papers(path_, dict_):
    """Count papers per (journal, year) under *path_*.

    *dict_* maps journal id -> {year -> count}; it is updated in place
    and also returned for convenience.
    """
    for root, dirs, files in os.walk(path_):
        for name in files:
            if not name.endswith(".json"):
                continue
            jfile = root + "/" + name
            year = get_year_jsonfile(jfile)
            j_id = get_journal_short_json(jfile)
            # setdefault/get collapses the original three-branch
            # nested if/else into a single counting idiom
            by_year = dict_.setdefault(j_id, {})
            by_year[year] = by_year.get(year, 0) + 1
    return dict_
def browse_papers(path_, csv_file, xmin=60):
    """Collect papers under *path_* with at least *xmin* citations.

    Returns a dict: doi -> [title (utf-8 bytes), journal, year, volume,
    issue, citations, references] (all but the title stringified).
    """
    print("Processing citations ...")
    dict_1, dict_2 = parse_csv_file(csv_file)
    print("Processing files ...")
    papers_list = {}
    for root, dirs, files in os.walk(path_):
        for name in files:
            if not name.endswith(".json"):
                continue
            jfile = root + "/" + name
            # close each JSON file promptly instead of leaking the handle
            with open(jfile) as jf:
                data = json.load(jf)
            year, month, day = get_date_jsonfile(jfile, data)
            journal = get_journal_short_json(jfile, data)
            issue, volume = get_issue_volume(jfile, data)
            # result unused; call kept for parity with the original
            coauthors = get_coauthors_jsonfile(jfile, data)
            title = get_title(jfile, data)
            doi_ = get_doi(jfile, data)
            # dict.get avoids the double lookup of `in d.keys()`
            cits_ = len(dict_1.get(doi_, []))   # citations received
            refs_ = len(dict_2.get(doi_, []))   # references made
            if cits_ >= xmin:
                papers_list[doi_] = [
                    title.encode('utf-8'), str(journal), str(year),
                    str(volume), str(issue), str(cits_), str(refs_),
                ]
    print("Database processed ...")
    return papers_list
def browse_papers(path_, csv_file):
    """Rebuild the ``aps-articles-basic`` MongoDB collection from the
    ``.json`` files under *path_*.

    Wipes the collection, then bulk-inserts one document per paper in
    batches of ``BIG_LIST_SIZE``. Returns the pymongo collection.
    """
    print("Processing citations ...")
    dict_1, dict_2 = parse_csv_file(csv_file)
    # client = MongoClient('localhost', 27017)
    client = MongoClient()
    db = client['apsdb']            # Get a databese
    aps = db['aps-articles-basic']  # Get a collection
    print("Removing all record ...")
    aps.delete_many({})             # Clean the collection
    print("Processing files ...")
    tmp_list = []
    for root, dirs, files in os.walk(path_):
        for name in files:
            if not name.endswith(".json"):
                continue
            jfile = root + "/" + name
            # close each JSON file promptly instead of leaking the handle
            with open(jfile) as jf:
                data = json.load(jf)
            year, month, day = get_date_jsonfile(jfile, data)
            journal = get_journal_short_json(jfile, data)
            issue, volume = get_issue_volume(jfile, data)
            doi = get_doi(jfile, data)
            num_pages = get_number_of_pages(jfile, data)
            coauthors = get_coauthors_jsonfile(jfile, data)
            affiliations = get_all_affiliations(jfile, data)
            countries = get_all_countries(jfile, data)
            title = get_title(jfile, data)
            aps_paper = {
                'year': year,
                'month': month,
                'day': day,
                'journal': journal,
                'issue': issue,
                'volume': volume,
                'doi': doi,
                'num_authors': len(coauthors),
                'num_affs': len(affiliations),
                'num_countries': len(countries),
                'title': title,
                'title_length': len(title),
                'num_pages': num_pages,
                # dict.get avoids the double lookup of `in d.keys()`
                'citations': len(dict_1.get(doi, [])),
                'num_references': len(dict_2.get(doi, [])),
            }
            tmp_list.append(aps_paper)
            # flush in batches to bound memory use
            if len(tmp_list) > BIG_LIST_SIZE:
                aps.insert_many(tmp_list)
                tmp_list = []
    if tmp_list:  # flush the remainder
        aps.insert_many(tmp_list)
    return aps
def browse_papers(path_, csv_file, ofiles):
    """Export five parallel column files — line *i* of each file describes
    the same paper: journal, year, citation count, abstract, title.

    *ofiles* is a 5-tuple of output paths.

    NOTE(review): this writer targets Python 2 text files (unbuffered
    ``'w'`` mode and ``bytes`` writes via ``.encode()``); under Python 3
    unbuffered text mode raises ``ValueError`` — confirm the intended
    interpreter before porting.
    """
    BUFF_SIZE = 0  # unbuffered, so output is not lost if the walk crashes
    of1, of2, of3, of4, of5 = ofiles
    f1 = open(of1, 'w', BUFF_SIZE)
    f2 = open(of2, 'w', BUFF_SIZE)
    f3 = open(of3, 'w', BUFF_SIZE)
    f4 = open(of4, 'w', BUFF_SIZE)
    f5 = open(of5, 'w', BUFF_SIZE)
    try:
        print("Processing citations ...")
        dict_1, dict_2 = parse_csv_file(csv_file)
        print("Processing files ...")
        for root, dirs, files in os.walk(path_):
            for name in files:
                if not name.endswith(".json"):
                    continue
                jfile = root + "/" + name
                # close each JSON file promptly instead of leaking the handle
                with open(jfile) as jf:
                    data = json.load(jf)
                try:
                    year, _, _ = get_date_jsonfile(jfile, data)
                    journal = get_journal_short_json(jfile, data)
                    doi = get_doi(jfile, data)
                    title = get_clean_title(jfile, data)
                    # dict.get avoids the double lookup of `in d.keys()`
                    cits = len(dict_1.get(doi, []))
                    abstract = get_clean_abstract(jfile, data)
                    abstract = abstract.replace('\n', ' ').replace('\r', '')
                    # Compute and encode every value BEFORE the first write,
                    # so a UnicodeEncodeError cannot leave the five files out
                    # of sync with a partially written record (this replaces
                    # the original's dead `_ = str(...)` pre-flight lines).
                    journal_s = str(journal)
                    year_s = str(year)
                    cits_s = str(cits)
                    abstract_b = abstract.encode('utf-8')
                    title_b = title.encode('utf-8')
                    f1.write(journal_s + "\n")
                    f2.write(year_s + "\n")
                    f3.write(cits_s + "\n")
                    f4.write(abstract_b + "\n")
                    f5.write(title_b + "\n")
                except KeyError as e:
                    # `print 'KeyError', e` was Python 2-only syntax;
                    # %-formatting keeps identical output on 2 and 3
                    print('KeyError %s' % e)
                except IOError as e:
                    print('IOError %s' % e)   # was mislabeled 'KeyError'
                except UnicodeEncodeError as e:
                    print('UnicodeEncodeError %s' % e)  # was mislabeled 'KeyError'
    finally:
        # close all five outputs even if the walk itself raises
        for f_ in (f1, f2, f3, f4, f5):
            f_.close()
    return