def get_wos_si_source_data(path_wos_si_source, ignore_extra_cols=True):
    """
    Read a WoS SI source file and collect cited forms with metadata.

    Each valid line has exactly 7 pipe-separated fields; invalid lines are
    reported and skipped.

    :param path_wos_si_source: path of the WoS SI source file
    :param ignore_extra_cols: if True, only the first cited-form column
        (field 2) is used; if False, the extra cited form (field 5) is
        collected as well
    :return: a set of 'issn|cited_form|year|volume|' strings (the trailing
        empty field keeps the format aligned with sources that carry an
        issue value)
    """
    cited_forms_with_metadata = set()
    # context manager guarantees the file is closed even if a line fails
    with open(path_wos_si_source) as file_wos_si_source:
        for line in file_wos_si_source:
            els = line.split('|')
            if len(els) != 7:
                print('line is invalid', line, sep="-->")
                continue
            issn = els[0]
            cited_form_1 = StringProcessor.preprocess_journal_title(
                els[1].strip()).upper()
            year = els[2]
            volume = els[3]
            if not ignore_extra_cols:
                cited_form_2 = StringProcessor.preprocess_journal_title(
                    els[4].strip()).upper()
            # only lines with complete metadata are collected
            if issn != '' and year != '' and volume != '':
                if cited_form_1 != '':
                    cited_forms_with_metadata.add(
                        '|'.join([issn, cited_form_1, year, volume, '']))
                if not ignore_extra_cols and cited_form_2 != '':
                    cited_forms_with_metadata.add(
                        '|'.join([issn, cited_form_2, year, volume, '']))
    return cited_forms_with_metadata
def extract_keys(citation: Citation):
    """
    Extract a key from a set of citation's fields

    :param citation: a object of the class Citation
    :return: tuple of keys (represented by a comma separated string) where
        the first key has the first page; (None, None) when there is no
        usable first author
    """
    # no first author at all -> no key can be built
    if not citation.first_author:
        return None, None

    given = StringProcessor.preprocess_name(
        citation.first_author.get('given_names', '')
        .replace('.', ' ').split(' ')[0]).lower()
    given_initial = given[0].lower() if given else ''

    surname = StringProcessor.preprocess_name(
        citation.first_author.get('surname', '')
        .replace('.', ' ').replace(';', ' ').split(' ')[-1]).lower()
    # a key without a surname would be too ambiguous
    if surname == '':
        return None, None

    pub_date = citation.publication_date or ''
    journal = (StringProcessor.preprocess_journal_title(citation.source).lower()
               if citation.source else '')
    number = citation.issue or ''
    volume = citation.volume or ''
    page = citation.first_page or ''

    # the minor key is the major key minus the first page
    shared_parts = [given_initial, surname, pub_date, journal, number, volume]
    major_key = ','.join(shared_parts + [page])
    minor_key = ','.join(shared_parts)
    return major_key, minor_key
def parse_html(html: str):
    """
    Convert a content in html format to a comma-separated-value string.

    :param html: content in html format
    :return: a sorted list of unique 'issn|TITLE|YEAR-VOLUME-NUMBER' strings
    """
    soup = bs4.BeautifulSoup(html, features='html.parser')

    # full titles and abbreviated titles are treated uniformly
    full_titles = {StringProcessor.preprocess_journal_title(t)
                   for t in _search_attribute('title', soup)}
    abbrev_titles = {StringProcessor.preprocess_journal_title(t)
                     for t in _search_attribute('title abbreviation', soup)}
    titles = full_titles | abbrev_titles

    # availability and recent issues both yield year/volume/number tuples
    cleaned_availability = _clean_availability(
        _search_attribute('availability', soup))
    cleaned_recent_issues = _clean_recent_issues(
        _search_attribute('recent issues', soup))
    years_volumes_numbers = set(cleaned_recent_issues) | set(cleaned_availability)

    # normalized issns (duplicates removed before normalization)
    issns = [_normalize_issn(raw.strip().upper())
             for raw in set(_search_attribute('issn', soup))]

    # one output line per (issn, title, year/volume/number) combination
    csv_lines = {
        issn + '|' + title.upper() + '|' + yvn.upper()
        for title in titles
        for yvn in years_volumes_numbers
        for issn in issns
        if issn != ''
    }
    return sorted(csv_lines)
def save_char_freq(c2freq: dict):
    """
    Save the char2freq dictionary to disk as a tab-separated file.

    Rows are written in decreasing order of frequency, each as
    char<TAB>preprocessed_char<TAB>count.

    :param c2freq: a dictionary where each key is a char and each value is
        the char's number of occurrences
    """
    # context manager guarantees the file is closed even if a write fails
    with open(DEFAULT_DIR_INDEXES + '../char_freq.csv', 'w') as final_c2freq:
        # dict.get is usable directly as the sort key (no lambda needed)
        for k in sorted(c2freq, key=c2freq.get, reverse=True):
            final_c2freq.write('%s\t%s\t%d\n' % (
                k, StringProcessor.preprocess_journal_title(k), c2freq.get(k)))
def extract_citation_data(citation_json: str):
    """
    Extract (preprocessed title, year, volume) from a citation record.

    Only citations that have a source and whose publication_type is
    'article' are considered; any other citation yields None. (The
    previous version fell through to the final return with unbound
    locals, raising UnboundLocalError for non-article citations.)

    :param citation_json: raw citation data used to build a Citation
    :return: tuple (cit_title_preprocessed, cit_year, cit_volume) for
        article citations with a source, otherwise None
    """
    cit = Citation(citation_json)
    # we compare only non-empty citations of type article
    if cit.source and cit.publication_type == 'article':
        # preprocess cited journal title
        cit_title_preprocessed = StringProcessor.preprocess_journal_title(
            cit.source).upper()
        # update dictionary of cited titles
        update_titles(cit_title_preprocessed)
        # collect year and volume for using in year volume base (if needed)
        return cit_title_preprocessed, cit.publication_date, cit.volume
    return None
def get_doi2cited_form_dict(path_refs_wos_doi):
    """
    Build a dict mapping DOI -> list of preprocessed cited journal forms.

    Valid lines have 6 pipe-separated fields: the cited form is field 3
    and the DOI is field 6; invalid lines are reported and skipped.

    :param path_refs_wos_doi: path of the refs-wos-doi file
    :return: dict where each key is a doi and each value is a list of
        unique cited forms
    """
    doi2cited_form = {}
    # errors='replace' prevents the UnicodeDecodeError that previously left
    # `line` unchanged inside the read loop and made it spin forever on the
    # same undecodable line; the file is also now closed deterministically
    with open(path_refs_wos_doi, errors='replace') as file_refs_wos_doi:
        for line in file_refs_wos_doi:
            rels = line.split('|')
            if len(rels) != 6:
                print('line is invalid', line, sep='-->')
                continue
            cited_form = StringProcessor.preprocess_journal_title(
                rels[2].strip()).upper()
            doi = rels[5].strip()
            # keep each cited form only once per doi
            forms = doi2cited_form.setdefault(doi, [])
            if cited_form not in forms:
                forms.append(cited_form)
    return doi2cited_form
# access local references' database refdb = MongoClient()[db_name] for col in refdb.list_collection_names(): print('\nStart %s' % col) num_articles = 0 num_all = 0 for cjson in refdb[col].find({}): cit = Citation(cjson) if cit.source: if cit.publication_type == 'article': print('\r%d' % num_articles, end='') num_articles += 1 cit_title_preprocessed = StringProcessor.preprocess_journal_title(cit.source).upper() cit_year = cit.publication_date cit_volume = cit.volume if cit_title_preprocessed not in TITLES: TITLES[cit_title_preprocessed] = 1 else: TITLES[cit_title_preprocessed] += 1 # exact match if cit_title_preprocessed in title2issnl: res_issns = title2issnl.get(cit_title_preprocessed) res_line = [col, cit.data.get('_id'), cit_title_preprocessed, res_issns, str(len(res_issns.split('#')))] results.write('\t'.join(res_line) + '\n') res_issns_els = res_issns.split('#')
numero = '' except UnavailableMetadataException as ume: logging.error('ERROR %s' % ume) issns = set() issns.add(article.journal.electronic_issn) issns.add(article.journal.print_issn) issns.add(article.journal.scielo_issn) issns = [ i.strip().upper() for i in issns if i is not None and i.upper() not in ['', 'ISSN'] ] titles = set() titles.add( StringProcessor.preprocess_journal_title( article.journal.abbreviated_iso_title)) titles.add( StringProcessor.preprocess_journal_title( article.journal.abbreviated_title)) titles.add( StringProcessor.preprocess_journal_title( article.journal.title)) titles = [t for t in titles if t is not None and t != ''] for t in sorted(titles): for i in issns: if year != '' and volume != '': row = '|'.join([ i, t.upper(), year.strip(),
def get_doc_attrs(document):
    """
    Return a list of the document's attributes.

    It is useful for creating/updating dictionaries of metadata2pid.

    :param document: raw article record (dict-like with '_id' and
        'collection' keys) used to build an Article
    :return: list [pid, document_type, first_author_given_names,
        first_author_surname, publication_date, journal_title,
        journal_abbrev_title, journal_issn_ppub, journal_issn_epub,
        issue_number, issue_order, issue_volume, start_page, collection]
    """
    pid = document.get('_id')
    xydoc = Article(document)

    document_type = xydoc.document_type.lower()

    first_author = xydoc.first_author
    if first_author is None:
        first_author = {}

    if 'given_names' in first_author:
        first_author_given_names = StringProcessor.preprocess_name(
            first_author.get('given_names', '').lower())
    else:
        first_author_given_names = ''

    if 'surname' in first_author:
        first_author_surname = StringProcessor.preprocess_name(
            first_author.get('surname', '').lower())
    else:
        first_author_surname = ''

    publication_date = xydoc.document_publication_date

    # NOTE(review): journal.title/abbreviated_title are assumed non-None
    # here (None would raise AttributeError on .lower()) — confirm data
    journal_title = StringProcessor.preprocess_journal_title(
        xydoc.journal.title.lower())
    journal_abbrev_title = StringProcessor.preprocess_journal_title(
        xydoc.journal.abbreviated_title.lower())

    journal_issn_ppub = xydoc.journal.print_issn
    if journal_issn_ppub is None:
        journal_issn_ppub = ''

    journal_issn_epub = xydoc.journal.electronic_issn
    if journal_issn_epub is None:
        journal_issn_epub = ''

    # some records have no/broken issue data: best-effort, fall back to ''.
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
    try:
        issue_number = xydoc.issue.number
        issue_order = xydoc.issue.order
        issue_volume = xydoc.issue.volume
    except Exception:
        issue_number = ''
        issue_order = ''
        issue_volume = ''

    if issue_number is None:
        issue_number = ''
    if issue_order is None:
        issue_order = ''
    if issue_volume is None:
        issue_volume = ''

    start_page = xydoc.start_page
    if start_page is None:
        start_page = ''

    return [
        pid,
        document_type,
        first_author_given_names,
        first_author_surname,
        publication_date,
        journal_title,
        journal_abbrev_title,
        journal_issn_ppub,
        journal_issn_epub,
        issue_number,
        issue_order,
        issue_volume,
        start_page,
        document.get('collection')
    ]
def get_cited_forms_with_metadata(path_crossref, doi2cited_form: dict):
    """
    Cross Crossref responses with the cited forms collected per DOI.

    Each line of the input file is one json Crossref response. For every
    DOI whose cited forms were previously collected, one
    'issn|cited_form|year|volume|issue' string is produced per print and
    per online issn that has complete metadata.

    :param path_crossref: path of a file with one Crossref json per line
    :param doi2cited_form: dict mapping doi -> list of cited forms
    :return: a set of pipe-joined metadata strings

    Fixes over the previous version:
    - UnicodeDecodeError in the readline left `line` unchanged and made
      the while loop spin forever; reading with errors='replace' avoids it
    - with several issns of the same type, the issn variable stayed a
      *list* and crashed the '|'.join below; the first value is used now
    - the warning for multiple print issns wrongly said "online"
    """

    def _single_issn(issn_list, issn_type, label):
        # Return the single issn of the requested type, or '' when absent;
        # warn (and take the first) when Crossref reports several.
        values = [i.get('value', '') for i in issn_list
                  if i.get('type', '') == issn_type]
        if not values:
            return ''
        if len(values) > 1:
            print('there are multiple %s issns %s' % (label, str(values)))
        return values[0]

    cited_forms_with_metadata = set()
    not_collected = 0
    with open(path_crossref, errors='replace') as file_crossref:
        for line in file_crossref:
            json_line = json.loads(line)
            doi = json_line.get('url_searched').replace(
                'https://api.crossref.org/works/', '')
            cited_forms = doi2cited_form.get(doi, [])
            if len(cited_forms) == 0:
                not_collected += 1
                continue
            message = json_line.get('message', {})
            if not isinstance(message, dict):
                continue
            volume = message.get('volume', '')
            issue = StringProcessor.preprocess_journal_title(
                message.get('issue', '')).upper()
            journal_issue = message.get('journal-issue', {})
            print_year = str(journal_issue.get(
                'published-print', {}).get('date-parts', [['', '']])[0][0])
            online_year = str(journal_issue.get(
                'published-online', {}).get('date-parts', [['', '']])[0][0])
            issns = message.get('issn-type', [{}])
            print_issn = _single_issn(issns, 'print', 'print')
            online_issn = _single_issn(issns, 'electronic', 'online')
            for cit in cited_forms:
                if print_issn != '' and cit != '' and print_year != '' and volume != '':
                    # in some cases the volume value is composed of two
                    # numbers separated by a hyphen
                    if '-' in volume:
                        volume = volume.split('-')[0]
                    cited_forms_with_metadata.add('|'.join(
                        [print_issn, cit, print_year, volume, issue]))
                if online_issn != '' and cit != '' and online_year != '' and volume != '':
                    cited_forms_with_metadata.add('|'.join(
                        [online_issn, cit, online_year, volume, issue]))
    return cited_forms_with_metadata
def read_base(base_name: str, issn2issnl: dict, mode='create'):
    """
    Read the attributes of a index base.

    :param issn2issnl: a dict where each key is a issn and each value is a issn-l
    :param base_name: the name of the index base
    :param mode: the mode of execution: 'create' to build the base dict;
        'count' to collect the original (unpreprocessed) titles for
        counting char occurrences
    :return: a dict where each key is a issn-l and each value is a list of
        [issns, main_titles, main_abbrev_titles, titles, countries, years]
    """
    dict_base = {}
    num_ignored_lines = 0

    # column layout and field separator for this base come from a
    # module-level configuration map
    cols_issn = BASE2COLUMN_INDEXES.get(base_name).get('issn')
    cols_title = BASE2COLUMN_INDEXES.get(base_name).get('title')
    col_country = BASE2COLUMN_INDEXES.get(base_name).get('country')
    base_sep = BASE2COLUMN_INDEXES.get(base_name).get('sep')

    # NOTE(review): the file handle is never closed — consider `with`
    base_data = open(DEFAULT_DIR_INDEXES + base_name + '.csv')
    # ignore first line (header)
    base_data.readline()
    line = base_data.readline()

    # NOTE(review): initialized only for mode == 'count', but returned
    # below under mode == 'count-char' — calling with 'count-char' raises
    # NameError and calling with 'count' never returns the collected
    # titles; the two mode strings look like they should be the same
    if mode == 'count':
        all_original_titles = []

    print('reading base %s' % base_name)
    while line:
        i = line.split(base_sep)
        # collect, validate and normalize the issns of this row
        issns = [i[j].strip().upper() for j in cols_issn if i[j].strip() != '' and is_valid_issn(i[j].strip())]
        issns = list(set([x.replace('-', '') for x in issns if x != '****-****']))
        if has_valid_issn(issns):
            if len(issns) > 0:
                issnl = get_issnl_from_dict(issns, issn2issnl)
                if issnl is not None:
                    # titles are collected twice: with and without the
                    # parenthesis info, then deduplicated and validated
                    titles = list(set([StringProcessor.preprocess_journal_title(i[j].strip(), remove_parenthesis_info=False) for j in cols_title]))
                    titles.extend(list(set([StringProcessor.preprocess_journal_title(i[j].strip()) for j in cols_title])))
                    titles = list(set([t.upper() for t in titles if is_valid_title(t)]))

                    main_title = ''
                    main_abbrev_title = ''
                    # the portal_issn base carries dedicated main-title columns
                    if base_name == 'portal_issn':
                        col_main_title = BASE2COLUMN_INDEXES.get(base_name).get('main_title')
                        col_main_title_alternative = BASE2COLUMN_INDEXES.get(base_name).get('main_title_alternative')
                        main_title = StringProcessor.preprocess_journal_title(i[col_main_title].strip()).upper()
                        # fall back to the alternative column when empty
                        if main_title == '':
                            main_title = StringProcessor.preprocess_journal_title(i[col_main_title_alternative].strip()).upper()
                        col_main_abbrev_title = BASE2COLUMN_INDEXES.get(base_name).get('main_abbrev_title')
                        main_abbrev_title = StringProcessor.preprocess_journal_title(i[col_main_abbrev_title].strip()).upper()

                    # parse the country column only when countries are not
                    # taken from the external dict
                    if not DEFAULT_USE_COUNTRY_FROM_DICT:
                        if col_country is not None:
                            country_name = StringProcessor.preprocess_name(i[col_country].strip().upper())
                            if len(country_name) != 0:
                                countries = {country_name}
                            else:
                                countries = set()
                        else:
                            countries = set()

                    # in count mode, keep the raw (unpreprocessed) titles
                    if mode == 'count':
                        titles = list(set([i[j].strip() for j in cols_title if is_valid_title(i[j].strip())]))
                        all_original_titles.extend(titles)

                    if issnl != '' and len(titles) > 0:
                        # NOTE(review): this reset discards the countries
                        # parsed from the column above when
                        # DEFAULT_USE_COUNTRY_FROM_DICT is False — verify
                        # this is intended
                        countries = set()
                        if DEFAULT_USE_COUNTRY_FROM_DICT:
                            countries = issnl2country.get(issnl, set())
                        years = issnl2years.get(issnl, set())
                        if issnl not in dict_base:
                            # value layout: [issns, main_titles,
                            # main_abbrev_titles, titles, countries, years]
                            dict_base[issnl] = [issns, [main_title], [main_abbrev_title], titles, countries, years]
                        else:
                            # merge this row into the existing issnl entry
                            dict_base[issnl][0].extend(issns)
                            dict_base[issnl][0] = list(set(dict_base[issnl][0]))
                            if main_title not in dict_base[issnl][1]:
                                dict_base[issnl][1].append(main_title)
                            if main_abbrev_title not in dict_base[issnl][2]:
                                dict_base[issnl][2].append(main_abbrev_title)
                            dict_base[issnl][3].extend(titles)
                            dict_base[issnl][3] = list(set(dict_base[issnl][3]))
                            dict_base[issnl][4] = dict_base[issnl][4].union(countries)
                            dict_base[issnl][5] = dict_base[issnl][5].union(years)
                else:
                    num_ignored_lines += 1
        else:
            num_ignored_lines += 1
        line = base_data.readline()

    # NOTE(review): see mode mismatch above — this branch is unreachable
    # without a NameError unless mode strings are unified
    if mode == 'count-char':
        return all_original_titles

    print('\tlines ignored %d' % num_ignored_lines)
    return dict_base