Ejemplo n.º 1
0
    def citations(self, data):
        """Build one normalized attribute dict per citation found in ``data``.

        :param data: container checked for a ``'citations'`` key; each item
            stored under that key is wrapped in ``Citation``.
        :return: list with one dict per citation, keyed by the keys of
            ``NORM_CITATION_ATTRS``. Empty list when ``data`` has no
            ``'citations'`` entry.
        """
        if 'citations' not in data:
            return []

        normalized = []
        for raw in data['citations']:
            citation = Citation(raw)
            fields = {}

            for target, attr in NORM_CITATION_ATTRS.items():
                # Best effort: a field whose lookup or preprocessing raises
                # AttributeError or TypeError is left out of the result.
                try:
                    if attr == 'first_author':
                        value = preprocess_default(
                            self.join_author(getattr(citation, attr)))
                    elif attr == 'publication_year':
                        value = preprocess_date(
                            citation.publication_date, return_int_year=True)
                    elif attr == 'index_number':
                        # stored as-is, without preprocessing
                        value = citation.index_number
                    else:
                        value = preprocess_default(getattr(citation, attr))
                except (AttributeError, TypeError):
                    continue
                fields[target] = value

            normalized.append(fields)

        return normalized
Ejemplo n.º 2
0
 def standardize_book(self, citation: Citation):
     """Return the standardized fields of a book citation.

     :param citation: citation whose authors, publication date, title,
         publisher and publisher address are standardized.
     :return: dict with ``'publication_type'`` set to ``'book'`` plus one
         ``std_*`` entry per standardized field.
     """
     # Computed in the same order in which the result dict lists them.
     authors = self._standardize_authors(citation.authors)
     date = self._standardize_date(citation.publication_date)
     title = self._standardize_title(citation.title())
     publisher = self._standardize_publisher(citation.publisher)
     address = self._standardize_publisher_address(
         citation.publisher_address)

     return {
         'publication_type': 'book',
         'std_authors': authors,
         'std_date': date,
         'std_title': title,
         'std_publisher': publisher,
         'std_publisher_address': address,
     }
Ejemplo n.º 3
0
def extract_cit_data(citation: Citation, cit_standardized_data=None):
    """
    Extract the cleaned data of a citation.

    :param citation: Citation from which the data will be extracted
    :param cit_standardized_data: Optional standardized data; for articles, the
        lower-cased first entry of its 'official-journal-title' is preferred over
        the journal title cleaned from ``citation.source``
    :return: Dictionary made of pairs of cleaned citation field names and their values
    """
    c_attrs = {}
    c_attrs.update(_extract_cit_authors(citation))

    pub_date = clean_publication_date(citation.publication_date)
    if pub_date:
        c_attrs['cleaned_publication_date'] = pub_date

    pub_type = citation.publication_type

    if pub_type == 'article':
        article_fields = ['issue', 'start_page', 'volume']
        c_attrs.update(_extract_cit_fields_by_list(citation, article_fields))

        # Prefer the standardized journal title; fall back to cleaning the
        # citation source when it is absent or empty.
        journal_title = ''
        if cit_standardized_data:
            official_titles = cit_standardized_data['official-journal-title']
            journal_title = official_titles[0].lower()
        if not journal_title:
            journal_title = clean_journal_title(citation.source)
        if journal_title:
            c_attrs['cleaned_journal_title'] = journal_title

        title = clean_field(citation.title())
        if title:
            c_attrs['cleaned_title'] = title

    elif pub_type == 'book':
        book_fields = ['source', 'publisher', 'publisher_address']
        c_attrs.update(_extract_cit_fields_by_list(citation, book_fields))

        chapter_title = clean_field(citation.chapter_title)
        if chapter_title:
            c_attrs['cleaned_chapter_title'] = chapter_title

    return c_attrs
Ejemplo n.º 4
0
 def standardize_article(self, citation: Citation):
     """Return the standardized fields of an article citation.

     :param citation: citation whose journal, authors, publication date,
         title, volume, page range and issue are standardized.
     :return: dict with ``'publication_type'`` set to ``'article'`` plus one
         ``std_*`` entry per standardized field.
     """
     # Computed in the same order in which the result dict lists them.
     journal = self._standardize_journal(citation)
     authors = self._standardize_authors(citation.authors)
     date = self._standardize_date(citation.publication_date)
     title = self._standardize_title(citation.title())
     volume = self._standardize_volume(citation.volume)
     pages = self._standardize_pages(citation.start_page, citation.end_page)
     issue = self._standardize_issue(citation.issue)

     return {
         'publication_type': 'article',
         'std_journal': journal,
         'std_authors': authors,
         'std_publication_date': date,
         'std_title': title,
         'std_volume': volume,
         'std_pages': pages,
         'std_issue': issue,
     }
Ejemplo n.º 5
0
def extract_citation_data(citation_json: str):
    """Extract the cited journal title, year and volume of an article citation.

    Side effect: the preprocessed title is passed to ``update_titles``.

    :param citation_json: raw citation record handed to ``Citation``.
    :return: tuple ``(preprocessed_title, publication_date, volume)`` when the
        citation has a source and its publication type is ``'article'``;
        ``None`` otherwise.
    """
    cit = Citation(citation_json)

    # Only non-empty article citations are compared. A citation with a source
    # but another publication type used to reach the return statement with the
    # three result names unbound and raise UnboundLocalError; it now yields
    # None, like an empty citation does.
    if not cit.source or cit.publication_type != 'article':
        return None

    # preprocess cited journal title
    cit_title_preprocessed = StringProcessor.preprocess_journal_title(cit.source).upper()

    # update dictionary of cited titles
    update_titles(cit_title_preprocessed)

    # year and volume are collected for the year-volume base (if needed)
    cit_year = cit.publication_date
    cit_volume = cit.volume

    return cit_title_preprocessed, cit_year, cit_volume
Ejemplo n.º 6
0
 def standardize_chapter(self, citation: Citation):
     """Return the standardized fields of a book-chapter citation.

     :param citation: citation whose chapter authors and title, book
         (monographic) authors and title, publication date, page range,
         publisher and publisher address are standardized.
     :return: dict with ``'publication_type'`` set to ``'chapter'`` plus one
         ``std_*`` entry per standardized field.
     """
     # Computed in the same order in which the result dict lists them.
     chapter_authors = self._standardize_authors(citation.authors)
     chapter_title = self._standardize_title(citation.chapter_title)
     book_authors = self._standardize_authors(citation.monographic_authors)
     book_title = self._standardize_title(citation.title())
     date = self._standardize_date(citation.publication_date)
     pages = self._standardize_pages(citation.start_page, citation.end_page)
     publisher = self._standardize_publisher(citation.publisher)
     address = self._standardize_publisher_address(
         citation.publisher_address)

     return {
         'publication_type': 'chapter',
         'std_authors': chapter_authors,
         'std_title': chapter_title,
         'std_book_authors': book_authors,
         'std_book_title': book_title,
         'std_date': date,
         'std_pages': pages,
         'std_publisher': publisher,
         'std_publisher_address': address,
     }
Ejemplo n.º 7
0
    results_fuzzy_todo = open(matches_folder + '/fuzzy_todo.tsv', 'w')

    # create dictionaries where the results will be added
    TITLES = {}
    TITLES_MATCHED = {}
    TITLES_UNMATCHED = {}

    # access local references' database
    refdb = MongoClient()[db_name]

    for col in refdb.list_collection_names():
        print('\nStart %s' % col)
        num_articles = 0
        num_all = 0
        for cjson in refdb[col].find({}):
            cit = Citation(cjson)
            if cit.source:
                if cit.publication_type == 'article':
                    print('\r%d' % num_articles, end='')
                    num_articles += 1

                    cit_title_preprocessed = StringProcessor.preprocess_journal_title(cit.source).upper()
                    cit_year = cit.publication_date
                    cit_volume = cit.volume

                    if cit_title_preprocessed not in TITLES:
                        TITLES[cit_title_preprocessed] = 1
                    else:
                        TITLES[cit_title_preprocessed] += 1

                    # exact match
Ejemplo n.º 8
0
                            format='%(message)s')
    else:
        print(
            'Error: please, provide the name of the local database MongoDB (e.g., ref_scielo)'
        )
        print('Error: please, provide the path of the dictionaries folder')
        sys.exit(1)

    major_dict = pickle.load(open(DICT_FOLDER + 'major_dict.dat', 'rb'))
    minor_dict = pickle.load(open(DICT_FOLDER + 'minor_dict.dat', 'rb'))

    mongo_client = MongoClient()

    ref_db = mongo_client[LOCAL_DOC_DATABASE_NAME]

    for col in ref_db.list_collection_names():
        print('matching references from collection %s' % col)
        for reference in ref_db[col].find({}):
            ref_cit = Citation(reference)
            if ref_cit.publication_type in ['article', 'conference']:
                major_key, minor_key = extract_keys(ref_cit)
                if major_key is not None and minor_key is not None:
                    if major_key in major_dict:
                        logging.info('MAJOR %s\t KEY %s\tREF %s -> %s' %
                                     (col, major_key, reference['_id'],
                                      major_dict.get(major_key)))
                    if minor_key in minor_dict:
                        logging.info('MINOR %s\t KEY %s\tREF %s -> %s' %
                                     (col, minor_key, reference['_id'],
                                      minor_dict.get(minor_key)))