Ejemplo n.º 1
0
def extract_keys(citation: Citation):
    """
    Build the two comma separated matching keys of a citation.

    Both keys are made of, in this order: the first character of the first
    author's first given name, the last token of the first author's surname,
    the publication date, the preprocessed journal title, the issue number
    and the issue volume. The major key additionally ends with the first page.
    Missing (falsy) fields are represented by empty strings.

    :param citation: an object of the class Citation
    :return: the tuple (major_key, minor_key), or (None, None) when the
        citation has no first author or the resulting surname is empty
    """
    author = citation.first_author
    if not author:
        return None, None

    # Only the first token of the given names is used; dots count as spaces.
    given_token = author.get('given_names', '').replace('.', ' ').split(' ')[0]
    given_name = StringProcessor.preprocess_name(given_token).lower()
    # Slicing yields '' for an empty name instead of raising IndexError.
    given_initial = given_name[:1].lower()

    # Only the last token of the surname is used; dots and semicolons count
    # as spaces.
    surname_token = author.get('surname', '').replace('.', ' ').replace(
        ';', ' ').split(' ')[-1]
    surname = StringProcessor.preprocess_name(surname_token).lower()
    if surname == '':
        return None, None

    date_part = citation.publication_date or ''

    source = citation.source
    if source:
        title_part = StringProcessor.preprocess_journal_title(source).lower()
    else:
        title_part = ''

    number_part = citation.issue or ''
    volume_part = citation.volume or ''
    page_part = citation.first_page or ''

    # Fields shared by both keys; the major key only adds the first page.
    shared_fields = [
        given_initial, surname, date_part, title_part, number_part, volume_part
    ]
    major_key = ','.join(shared_fields + [page_part])
    minor_key = ','.join(shared_fields)

    return major_key, minor_key
Ejemplo n.º 2
0
    def get_doc_attrs(document):
        """
        Returns a list of the document's attributes.
        It is useful for creating/updating dictionaries of metadata2pid.

        The document is wrapped in an Article object to read its metadata.
        Attributes that are None (ISSNs, issue fields, start page) are
        replaced by empty strings, and author names are empty strings when
        the corresponding key is absent from the first author.

        :param document: a mapping holding the raw document; its '_id' and
            'collection' entries are read directly with .get()
        :return: a list with, in this order: pid, document type, first
            author's given names, first author's surname, publication date,
            journal title, journal abbreviated title, print ISSN, electronic
            ISSN, issue number, issue order, issue volume, start page and
            collection
        """
        pid = document.get('_id')
        xydoc = Article(document)
        document_type = xydoc.document_type.lower()

        first_author = xydoc.first_author
        if first_author is None:
            first_author = {}

        if 'given_names' in first_author:
            first_author_given_names = StringProcessor.preprocess_name(
                first_author['given_names'].lower())
        else:
            first_author_given_names = ''

        if 'surname' in first_author:
            first_author_surname = StringProcessor.preprocess_name(
                first_author['surname'].lower())
        else:
            first_author_surname = ''

        publication_date = xydoc.document_publication_date

        journal = xydoc.journal
        journal_title = StringProcessor.preprocess_journal_title(
            journal.title.lower())
        journal_abbrev_title = StringProcessor.preprocess_journal_title(
            journal.abbreviated_title.lower())

        journal_issn_ppub = journal.print_issn
        if journal_issn_ppub is None:
            journal_issn_ppub = ''

        journal_issn_epub = journal.electronic_issn
        if journal_issn_epub is None:
            journal_issn_epub = ''

        # Best effort: if any issue attribute cannot be read, all three fall
        # back to ''. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        try:
            issue = xydoc.issue
            issue_number = issue.number
            issue_order = issue.order
            issue_volume = issue.volume
        except Exception:
            issue_number = issue_order = issue_volume = ''

        issue_number, issue_order, issue_volume = (
            '' if value is None else value
            for value in (issue_number, issue_order, issue_volume)
        )

        start_page = xydoc.start_page
        if start_page is None:
            start_page = ''

        return [
            pid, document_type, first_author_given_names, first_author_surname,
            publication_date, journal_title, journal_abbrev_title,
            journal_issn_ppub, journal_issn_epub, issue_number, issue_order,
            issue_volume, start_page,
            document.get('collection')
        ]
def read_base(base_name: str, issn2issnl: dict, mode='create'):
    """
    Read the attributes of an index base and group them by ISSN-L.

    The base is read from the file DEFAULT_DIR_INDEXES + base_name + '.csv';
    its first line is skipped and every other line is split on the separator
    configured for the base in BASE2COLUMN_INDEXES.

    :param base_name: the name of the index base
    :param issn2issnl: a dict where each key is an ISSN and each value is an ISSN-L
    :param mode: the mode of execution:
        'create' (default) builds the base from the preprocessed titles;
        'count' stores the raw (only stripped) valid titles in the base
        instead of the preprocessed ones and returns the base;
        'count-char' does the same but returns the flat list of all raw
        titles collected, so that char occurrences can be counted.
    :return: for 'count-char', a list of raw titles; otherwise a dict where
        each key is an ISSN-L and each value is the list
        [issns, main_titles, main_abbrev_titles, titles, countries, years]
    """
    base_columns = BASE2COLUMN_INDEXES.get(base_name)
    cols_issn = base_columns.get('issn')
    cols_title = base_columns.get('title')
    col_country = base_columns.get('country')
    base_sep = base_columns.get('sep')

    # Only the 'portal_issn' base carries main title columns; resolve their
    # indexes once instead of once per line.
    is_portal_issn = base_name == 'portal_issn'
    if is_portal_issn:
        col_main_title = base_columns.get('main_title')
        col_main_title_alternative = base_columns.get('main_title_alternative')
        col_main_abbrev_title = base_columns.get('main_abbrev_title')

    # Both spellings select the raw-title collection. Previously the list was
    # created for 'count' but returned for 'count-char', so 'count-char'
    # always failed with NameError.
    collect_raw_titles = mode in ('count', 'count-char')
    all_original_titles = []

    dict_base = {}
    num_ignored_lines = 0

    # The context manager guarantees the file is closed, even on errors.
    with open(DEFAULT_DIR_INDEXES + base_name + '.csv') as base_data:
        # ignore first line
        next(base_data, None)

        print('reading base %s' % base_name)

        for line in base_data:
            row = line.split(base_sep)

            issn_cells = (row[j].strip() for j in cols_issn)
            issns = [c.upper() for c in issn_cells if c != '' and is_valid_issn(c)]
            issns = list({x.replace('-', '') for x in issns if x != '****-****'})

            if not has_valid_issn(issns):
                num_ignored_lines += 1
                continue

            if not issns:
                # Not counted as ignored, as in the original control flow.
                continue

            issnl = get_issnl_from_dict(issns, issn2issnl)
            if issnl is None:
                num_ignored_lines += 1
                continue

            # Each title is kept in two preprocessed forms: with and without
            # the parenthesis information.
            raw_titles = [row[j].strip() for j in cols_title]
            candidates = {
                StringProcessor.preprocess_journal_title(t, remove_parenthesis_info=False)
                for t in raw_titles
            }
            candidates.update(
                StringProcessor.preprocess_journal_title(t) for t in raw_titles
            )
            titles = list({t.upper() for t in candidates if is_valid_title(t)})

            main_title = ''
            main_abbrev_title = ''
            if is_portal_issn:
                main_title = StringProcessor.preprocess_journal_title(
                    row[col_main_title].strip()).upper()
                if main_title == '':
                    main_title = StringProcessor.preprocess_journal_title(
                        row[col_main_title_alternative].strip()).upper()
                main_abbrev_title = StringProcessor.preprocess_journal_title(
                    row[col_main_abbrev_title].strip()).upper()

            if collect_raw_titles:
                titles = list({t for t in raw_titles if is_valid_title(t)})
                all_original_titles.extend(titles)

            if issnl == '' or not titles:
                continue

            if DEFAULT_USE_COUNTRY_FROM_DICT:
                countries = issnl2country.get(issnl, set())
            else:
                # Country taken from the base's own column. Previously this
                # value was computed and then unconditionally reset to an
                # empty set before being stored.
                countries = set()
                if col_country is not None:
                    country_name = StringProcessor.preprocess_name(
                        row[col_country].strip().upper())
                    if country_name:
                        countries = {country_name}

            years = issnl2years.get(issnl, set())

            if issnl not in dict_base:
                dict_base[issnl] = [issns, [main_title], [main_abbrev_title], titles, countries, years]
                continue

            # Merge with the entry already registered for this ISSN-L.
            entry = dict_base[issnl]
            entry[0] = list(set(entry[0]).union(issns))
            if main_title not in entry[1]:
                entry[1].append(main_title)
            if main_abbrev_title not in entry[2]:
                entry[2].append(main_abbrev_title)
            entry[3] = list(set(entry[3]).union(titles))
            entry[4] = entry[4].union(countries)
            entry[5] = entry[5].union(years)

    if mode == 'count-char':
        return all_original_titles

    print('\tlines ignored %d' % num_ignored_lines)
    return dict_base