def citations(self, data):
    """Build one dict of normalized attributes per citation in ``data``.

    Every entry of ``data['citations']`` is wrapped in a ``Citation``. For
    each ``NORM_CITATION_ATTRS`` item (output key -> citation attribute
    name) the attribute is read and preprocessed, then stored under the
    output key. An attribute whose lookup or preprocessing raises
    ``AttributeError`` or ``TypeError`` is left out of that citation's dict.

    :param data: container that may hold a ``'citations'`` entry
    :return: list with one dict per citation; empty when ``data`` has no
        ``'citations'`` entry
    """
    if 'citations' not in data:
        return []

    normalized = []
    for raw_citation in data['citations']:
        cit = Citation(raw_citation)
        fields = {}
        for out_key, attr_name in NORM_CITATION_ATTRS.items():
            try:
                if attr_name == 'first_author':
                    value = preprocess_default(
                        self.join_author(getattr(cit, attr_name)))
                elif attr_name == 'publication_year':
                    value = preprocess_date(
                        cit.publication_date, return_int_year=True)
                elif attr_name == 'index_number':
                    # Stored as-is, without preprocessing.
                    value = cit.index_number
                else:
                    value = preprocess_default(getattr(cit, attr_name))
            except (AttributeError, TypeError):
                # Best effort: skip attributes that cannot be resolved.
                continue
            fields[out_key] = value
        normalized.append(fields)
    return normalized
def standardize_book(self, citation: Citation):
    """Return the standardized fields of a book citation.

    Each ``std_*`` value is produced by the matching ``self._standardize_*``
    helper applied to the corresponding citation field.

    :param citation: citation whose fields are standardized
    :return: dict with ``'publication_type'`` set to ``'book'`` followed by
        the ``std_*`` entries
    """
    std = {'publication_type': 'book'}
    std['std_authors'] = self._standardize_authors(citation.authors)
    std['std_date'] = self._standardize_date(citation.publication_date)
    std['std_title'] = self._standardize_title(citation.title())
    std['std_publisher'] = self._standardize_publisher(citation.publisher)
    std['std_publisher_address'] = self._standardize_publisher_address(
        citation.publisher_address)
    return std
def extract_cit_data(citation: Citation, cit_standardized_data=None):
    """
    Extract the data of a citation.

    :param citation: Citation from which the data will be extracted
    :param cit_standardized_data: For articles, the standardized journal
        data; when given, its first 'official-journal-title' entry
        (lower-cased) is preferred over the title cleaned from the citation
    :return: Dictionary made of pairs of cleaned citation field names and
        their respective values
    """
    fields = {}
    fields.update(_extract_cit_authors(citation))

    pub_date = clean_publication_date(citation.publication_date)
    if pub_date:
        fields['cleaned_publication_date'] = pub_date

    kind = citation.publication_type
    if kind == 'article':
        fields.update(
            _extract_cit_fields_by_list(
                citation, ['issue', 'start_page', 'volume']))

        # Prefer the standardized journal title; fall back to the title
        # cleaned from the citation source when it is missing or empty.
        journal_title = ''
        if cit_standardized_data:
            official_titles = cit_standardized_data['official-journal-title']
            journal_title = official_titles[0].lower()
        if not journal_title:
            journal_title = clean_journal_title(citation.source)
        if journal_title:
            fields['cleaned_journal_title'] = journal_title

        title = clean_field(citation.title())
        if title:
            fields['cleaned_title'] = title
    elif kind == 'book':
        fields.update(
            _extract_cit_fields_by_list(
                citation, ['source', 'publisher', 'publisher_address']))

        chapter_title = clean_field(citation.chapter_title)
        if chapter_title:
            fields['cleaned_chapter_title'] = chapter_title

    return fields
def standardize_article(self, citation: Citation):
    """Return the standardized fields of an article citation.

    The journal entry is computed from the whole citation; every other
    ``std_*`` value is produced by the matching ``self._standardize_*``
    helper applied to the corresponding citation field(s).

    :param citation: citation whose fields are standardized
    :return: dict with ``'publication_type'`` set to ``'article'`` followed
        by the ``std_*`` entries
    """
    std = {'publication_type': 'article'}
    std['std_journal'] = self._standardize_journal(citation)
    std['std_authors'] = self._standardize_authors(citation.authors)
    std['std_publication_date'] = self._standardize_date(
        citation.publication_date)
    std['std_title'] = self._standardize_title(citation.title())
    std['std_volume'] = self._standardize_volume(citation.volume)
    std['std_pages'] = self._standardize_pages(
        citation.start_page, citation.end_page)
    std['std_issue'] = self._standardize_issue(citation.issue)
    return std
def extract_citation_data(citation_json: str):
    """Extract journal title, publication date and volume from an article citation.

    The cited journal title is preprocessed, upper-cased and passed to
    ``update_titles`` before being returned.

    :param citation_json: raw citation record handed to ``Citation``
    :return: tuple ``(preprocessed title, publication date, volume)``, or
        ``None`` when the citation has no source or is not an article
    """
    cit = Citation(citation_json)

    # A citation without a source is treated as empty.
    if not cit.source:
        return None

    # Only articles are compared.
    if cit.publication_type != 'article':
        return None

    # Preprocess the cited journal title.
    journal_title = StringProcessor.preprocess_journal_title(cit.source).upper()

    # Update the dictionary of cited titles.
    update_titles(journal_title)

    # Year and volume are collected for the year/volume base (if needed).
    return journal_title, cit.publication_date, cit.volume
def standardize_chapter(self, citation: Citation):
    """Return the standardized fields of a book-chapter citation.

    Chapter-level authors and title come from ``citation.authors`` and
    ``citation.chapter_title``; book-level authors and title come from
    ``citation.monographic_authors`` and ``citation.title()``.

    :param citation: citation whose fields are standardized
    :return: dict with ``'publication_type'`` set to ``'chapter'`` followed
        by the ``std_*`` entries
    """
    std = {'publication_type': 'chapter'}
    std['std_authors'] = self._standardize_authors(citation.authors)
    std['std_title'] = self._standardize_title(citation.chapter_title)
    std['std_book_authors'] = self._standardize_authors(
        citation.monographic_authors)
    std['std_book_title'] = self._standardize_title(citation.title())
    std['std_date'] = self._standardize_date(citation.publication_date)
    std['std_pages'] = self._standardize_pages(
        citation.start_page, citation.end_page)
    std['std_publisher'] = self._standardize_publisher(citation.publisher)
    std['std_publisher_address'] = self._standardize_publisher_address(
        citation.publisher_address)
    return std
results_fuzzy_todo = open(matches_folder + '/fuzzy_todo.tsv', 'w') # create dictionarires where the results will be added TITLES = {} TITLES_MATCHED = {} TITLES_UNMATCHED = {} # access local references' database refdb = MongoClient()[db_name] for col in refdb.list_collection_names(): print('\nStart %s' % col) num_articles = 0 num_all = 0 for cjson in refdb[col].find({}): cit = Citation(cjson) if cit.source: if cit.publication_type == 'article': print('\r%d' % num_articles, end='') num_articles += 1 cit_title_preprocessed = StringProcessor.preprocess_journal_title(cit.source).upper() cit_year = cit.publication_date cit_volume = cit.volume if cit_title_preprocessed not in TITLES: TITLES[cit_title_preprocessed] = 1 else: TITLES[cit_title_preprocessed] += 1 # exact match
format='%(message)s') else: print( 'Error: please, provide the name of the local database MongoDB (e.g., ref_scielo)' ) print('Error: please, provide the path of the dictionaries folder') sys.exit(1) major_dict = pickle.load(open(DICT_FOLDER + 'major_dict.dat', 'rb')) minor_dict = pickle.load(open(DICT_FOLDER + 'minor_dict.dat', 'rb')) mongo_client = MongoClient() ref_db = mongo_client[LOCAL_DOC_DATABASE_NAME] for col in ref_db.list_collection_names(): print('matching references from collection %s' % col) for reference in ref_db[col].find({}): ref_cit = Citation(reference) if ref_cit.publication_type in ['article', 'conference']: major_key, minor_key = extract_keys(ref_cit) if major_key is not None and minor_key is not None: if major_key in major_dict: logging.info('MAJOR %s\t KEY %s\tREF %s -> %s' % (col, major_key, reference['_id'], major_dict.get(major_key))) if minor_key in minor_dict: logging.info('MINOR %s\t KEY %s\tREF %s -> %s' % (col, minor_key, reference['_id'], minor_dict.get(minor_key)))