Ejemplo n.º 1
0
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Download a ketab.ir book page and extract its citation fields.

    The page is fetched with ``request``, decoded as UTF-8 and scanned with
    the module-level ``*_SEARCH`` / ``AUTHORS_FINDALL`` helpers.

    Args:
        ketabir_url: Address of the book page to fetch.

    Returns:
        A ``defaultdict`` whose missing keys read as ``None``.  It always
        carries ``cite_type='book'`` and ``title``; ``authors``, ``others``,
        ``editors``, ``translators``, ``publisher``, ``month``, ``year``,
        ``isbn``, ``volume`` and ``publisher-location`` are added only when
        the corresponding pattern matched.  ``None`` is returned when the
        request raises ``RequestException`` (the failure is logged).
    """
    try:
        # ketab.ir may be unavailable; per the original note, ottobib is
        # expected to continue its work in isbn.py when it is.
        r = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return None
    html = r.content.decode('utf-8')
    d = defaultdict(lambda: None, cite_type='book')
    # NOTE(review): unlike every field below, the title match is not
    # guarded; if TITLE_SEARCH can return None this raises TypeError.
    # Left unchanged because callers may rely on that failure -- confirm.
    d['title'] = TITLE_SEARCH(html)[1]

    # Contributors are sorted into lists by the role label attached to them.
    authors, translators, editors, others = [], [], [], []
    list_for_role = {
        'نويسنده': authors,
        'مترجم': translators,
        'ويراستار': editors,
    }
    for role, name in AUTHORS_FINDALL(html):
        target = list_for_role.get(role)
        if target is None:
            # Unrecognised role: keep the name, with the role in brackets,
            # as a last-name-only entry.
            others.append(('', f'{name} ({role})'))
        else:
            target.append(first_last(name))
    # Only non-empty lists are stored, so absent roles stay missing keys.
    for key, people in (
        ('authors', authors),
        ('others', others),
        ('editors', editors),
        ('translators', translators),
    ):
        if people:
            d[key] = people

    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]
    m = DATE_SEARCH(html)
    if m:
        # The original branched on ``LANG != 'fa'`` here, but both arms were
        # identical, so the redundant conditional has been removed.
        # NOTE(review): if a non-'fa' LANG was meant to get a converted
        # date, that conversion was never implemented.
        d['month'] = m['month']
        d['year'] = '۱۳' + m['year']
    # The remaining fields each take group 1 of their own pattern.
    for key, search in (
        ('isbn', ISBN_SEARCH),
        ('volume', VOLUME_SEARCH),
        ('publisher-location', LOCATION_SEARCH),
    ):
        m = search(html)
        if m:
            d[key] = m[1]
    return d
Ejemplo n.º 2
0
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Fetch a ketab.ir page and build a citation dictionary from it.

    Args:
        ketabir_url: Address of the book page; it is passed to ``request``.

    Returns:
        ``None`` if ``request`` raises ``RequestException`` (which is
        logged).  Otherwise a ``defaultdict`` (missing keys read as
        ``None``) holding ``cite_type='book'``, ``title`` and every
        optional field whose pattern matched the page: ``authors``,
        ``others``, ``editors``, ``translators``, ``publisher``, ``month``,
        ``year``, ``isbn``, ``volume``, ``publisher-location``.
    """
    try:
        # If ketab.ir cannot be reached, the original note says ottobib
        # should continue its work in isbn.py.
        response = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return None
    html = response.content.decode('utf-8')
    d = defaultdict(lambda: None, cite_type='book')
    # NOTE(review): this subscript is unguarded, so a page on which
    # TITLE_SEARCH finds nothing would raise here (assuming it returns
    # None like a regex search) -- confirm callers expect that.
    d['title'] = TITLE_SEARCH(html)[1]

    # Collect contributor names per role.
    authors = []
    translators = []
    editors = []
    others = []
    for role, name in AUTHORS_FINDALL(html):
        if role == 'نويسنده':
            authors.append(first_last(name))
        elif role == 'مترجم':
            translators.append(first_last(name))
        elif role == 'ويراستار':
            editors.append(first_last(name))
        else:
            # Any other role is kept inside the name, with no first name.
            others.append(('', f'{name} ({role})'))
    # Empty lists are not stored.
    if authors:
        d['authors'] = authors
    if others:
        d['others'] = others
    if editors:
        d['editors'] = editors
    if translators:
        d['translators'] = translators

    publisher = PUBLISHER_SEARCH(html)
    if publisher:
        d['publisher'] = publisher[1]
    date = DATE_SEARCH(html)
    if date:
        # A former ``if LANG != 'fa'`` / ``else`` pair executed the same
        # two assignments in both arms; it is collapsed into one path.
        d['month'] = date['month']
        d['year'] = '۱۳' + date['year']
    isbn = ISBN_SEARCH(html)
    if isbn:
        d['isbn'] = isbn[1]
    volume = VOLUME_SEARCH(html)
    if volume:
        d['volume'] = volume[1]
    location = LOCATION_SEARCH(html)
    if location:
        d['publisher-location'] = location[1]
    return d
Ejemplo n.º 3
0
def ris_parse(ris_text):
    """Convert RIS text into a dictionary of citation fields.

    The named groups captured by ``RIS_FULLMATCH`` seed the result, which
    is a ``defaultdict`` whose missing keys read as ``None``.  On top of
    those raw groups the function derives ``cite_type``, ``journal``,
    ``isbn``/``issn``, ``authors``, ``doi`` and ``page``, and trims ``url``
    to its first ``;``-separated part.
    """
    record = defaultdict(lambda: None)
    matched = RIS_FULLMATCH(ris_text)
    record.update(matched.groupdict())

    # Reference type (book, journal, ...), lower-cased.
    kind = record['type'].lower()
    if kind == 'jour':
        secondary_title = record['t2']
        if secondary_title is not None:
            record['journal'] = secondary_title
    link = record['url']
    # An electronic source that has a URL is cited as a web page.
    record['cite_type'] = 'web' if kind == 'elec' and link else kind

    serial = record['sn']
    if serial:
        # SN holds either an ISBN or an ISSN; the pattern decides which.
        number_key = 'isbn' if ISBN_10OR13_SEARCH(serial) else 'issn'
        record[number_key] = serial

    # 'authors' must not be created unless the record has author fields.
    raw_authors = matched.captures('author')
    if raw_authors:
        # From RIS Format Specifications:
        # Each author must be on a separate line, preceded by this tag. Each
        # reference can contain unlimited author fields, and can contain up
        # to 255 characters for each field.
        # The author name must be in the following syntax:
        # Lastname,Firstname,Suffix
        # For Firstname, you can use full names, initials, or both.
        record['authors'] = parsed = []
        for raw in raw_authors:
            try:
                parsed.append(first_last(raw, separator=','))
            except InvalidNameError:
                # Names that cannot be parsed are skipped.
                continue

    # A DOI may sit in the N1 (notes) tag, so the whole text is searched.
    doi_match = DOI_SEARCH(ris_text)
    if doi_match is not None:
        record['doi'] = doi_match[0]

    first_page = record['start_page']
    if first_page:
        last_page = record['end_page']
        record['page'] = (
            first_page + '–' + last_page if last_page else first_page
        )

    # In RIS, several URLs can be separated using a ";"; keep the first.
    if link:
        record['url'] = link.partition(';')[0]
    return record
Ejemplo n.º 4
0
def parse(bibtex):
    """Parse a BibTeX string and return its fields as a dictionary.

    The text is first passed through ``special_sequence_cleanup``; the
    mapping returned by ``search_for_tag`` is then extended in place:

    * ``cite_type`` -- group 1 of ``TYPE_SEARCH``, stripped and lower-cased
      (book, journal, incollection, etc.), when the pattern matches;
    * ``authors`` / ``editors`` -- lists of ``first_last`` results that
      replace the raw ``author`` / ``editor`` entries;
    * ``page`` -- ``pages`` with spaces removed and hyphens turned into
      en dashes.
    """
    bibtex = special_sequence_cleanup(bibtex)
    d = search_for_tag(bibtex)

    type_match = TYPE_SEARCH(bibtex)
    if type_match:
        d['cite_type'] = type_match[1].strip().lower()

    # The editor branch was noted in the original as an untested copy of
    # the author branch; both follow exactly the same steps.
    for raw_key, list_key in (('author', 'authors'), ('editor', 'editors')):
        joined = d[raw_key]
        if not joined:
            continue
        d[list_key] = people = []
        for piece in joined.split(' and '):
            # A dangling " and" can be left at the end of a piece.
            if piece.endswith(' and'):
                piece = piece[:-4]
            if piece:
                people.append(first_last(piece))
        del d[raw_key]

    pages = d['pages']
    if pages:
        compact = pages.replace(' ', '')
        d['page'] = compact.replace('--', '–').replace('-', '–')
    return d
Ejemplo n.º 5
0
def parse(bibtex):
    """Return a dictionary of citation information read from BibTeX text.

    After ``special_sequence_cleanup`` the tags found by ``search_for_tag``
    form the result.  ``cite_type`` is added when ``TYPE_SEARCH`` matches,
    the raw ``author`` and ``editor`` strings are replaced by ``authors``
    and ``editors`` name lists, and ``page`` is derived from ``pages``.
    """

    def split_names(joined):
        """Split an " and "-joined string and parse each non-empty part."""
        parsed = []
        for part in joined.split(' and '):
            if part.endswith(' and'):
                part = part[:-4]
            if not part:
                continue
            parsed.append(first_last(part))
        return parsed

    bibtex = special_sequence_cleanup(bibtex)
    d = search_for_tag(bibtex)
    # cite_type: book, journal, incollection, etc.
    found_type = TYPE_SEARCH(bibtex)
    if found_type:
        d['cite_type'] = found_type[1].strip().lower()

    raw_authors = d['author']
    if raw_authors:
        d['authors'] = split_names(raw_authors)
        del d['author']
    # Editors are handled exactly like authors (the original marked this
    # path as not tested).
    raw_editors = d['editor']
    if raw_editors:
        d['editors'] = split_names(raw_editors)
        del d['editor']

    page_range = d['pages']
    if page_range:
        # Drop spaces, then turn double and single hyphens into en dashes.
        without_spaces = page_range.replace(' ', '')
        d['page'] = without_spaces.replace('--', '–').replace('-', '–')
    return d
Ejemplo n.º 6
0
Archivo: ris.py Proyecto: 5j9/yadkard
def parse(ris_text):
    """Parse RIS text and return the extracted fields as a dictionary.

    The result is a ``defaultdict`` (missing keys read as ``None``) seeded
    with the named groups of ``RIS_FULLMATCH``.  ``cite_type``, ``authors``,
    ``doi`` and ``page`` are derived from them, and ``url`` is cut at the
    first ``;``.
    """
    fields = defaultdict(lambda: None)
    ris_match = RIS_FULLMATCH(ris_text)
    fields.update(ris_match.groupdict())

    # Reference type (book, journal, ...), lower-cased.
    ris_type = fields['type'].lower()
    address = fields['url']
    # An 'elec' record that has a URL is cited as a web page.
    if ris_type == 'elec' and address:
        fields['cite_type'] = 'web'
    else:
        fields['cite_type'] = ris_type

    # The 'authors' key is created only when author fields were captured.
    captured = ris_match.captures('author')
    if captured:
        # From RIS Format Specifications:
        # Each author must be on a separate line, preceded by this tag. Each
        # reference can contain unlimited author fields, and can contain up
        # to 255 characters for each field.
        # The author name must be in the following syntax:
        # Lastname,Firstname,Suffix
        # For Firstname, you can use full names, initials, or both.
        fields['authors'] = name_list = []
        for entry in captured:
            try:
                name_list.append(first_last(entry, separator=','))
            except InvalidNameError:
                # Invalid names are left out of the list.
                continue

    # DOIs may appear in the N1 (notes) tag, so search the whole text.
    doi = DOI_SEARCH(ris_text)
    if doi:
        fields['doi'] = doi[0]

    begin = fields['start_page']
    if begin:
        end = fields['end_page']
        fields['page'] = begin + '–' + end if end else begin

    # In RIS, a url field can hold several URLs separated by ";".
    if address:
        fields['url'] = address.partition(';')[0]
    return fields
Ejemplo n.º 7
0
def byline_to_names(byline) -> Optional[List[Tuple[str, str]]]:
    r"""Find author names in a byline string.

    Only the text before the first ``|`` is considered.  A date found by
    ``ANYDATE_SEARCH`` and everything after it is cut off.  The remaining
    text is normalised, its leading "By " prefix and trailing " and"/","
    are removed, and it is split into full names.  Each full name is cut
    at " in " / " for ", skipped when ``STOPWORDS_SEARCH`` matches it, and
    parsed with ``first_last`` (unparsable names are skipped).

    For example, the original documentation gives
    ``'\n By Roger Highfield, Science Editor \n'`` as yielding the single
    name Roger Highfield, and ``' By Erika Solomon in Beirut and Borzou
    Daragahi, Middle East correspondent'`` as yielding Erika Solomon and
    Borzou Daragahi.

    Returns:
        A list of ``(first, last)`` tuples.  Entries with an empty first
        name (treated as organisations) are dropped unless nothing else
        remains, in which case the first parsed entry alone is returned.
        ``None`` is returned when the byline contains ``:``, is empty
        after date removal, matches ``FOUR_DIGIT_NUM``, or yields no
        names.
    """
    text = byline.partition('|')[0]
    # NOTE(review): both operands of this ``or`` look identical; one of
    # them may originally have been a different colon character -- confirm.
    if ':' in text or ':' in text:
        return None
    date_match = ANYDATE_SEARCH(text)
    if date_match:
        # Keep only what precedes the date.
        text = text[:date_match.start()]
    if not text or FOUR_DIGIT_NUM(text):
        return None

    # Normalise 'and\n' (and similar) to a standard ' and '.  This has to
    # happen before the byline is cut at the first newline.
    text = NORMALIZE_ANDS(' and ', text)
    text = NORMALIZE_COMMA_SPACES(', ', text)
    # Remove the starting "by", cut at the first newline and lstrip.
    text = BY_PREFIX(r'\1', text)
    # Remove an ending " and" or "," and rstrip.
    text = AND_OR_COMMA_SUFFIX('', text)

    if ' and ' in text.lower() or ' ' in text.partition(', ')[0]:
        split = AND_OR_COMMA_SPLIT
    else:
        # The comma may separate a last name from a first name, so only
        # split on "and".
        split = AND_SPLIT

    names = []
    for fullname in split(text):
        candidate = fullname.partition(' in ')[0].partition(' for ')[0]
        if STOPWORDS_SEARCH(candidate):
            continue
        try:
            first, last = first_last(candidate)
        except InvalidNameError:
            continue
        if first.startswith(('The ', 'خبرگزار')) or last.islower():
            # Store the whole string as a last name with no first name.
            first, last = '', first + ' ' + last
        names.append((first, last))
    if not names:
        return None

    # Drop names without a first name (orgs), but fall back to the first
    # parsed entry when that would leave nothing.
    with_first_name = [(fn, ln) for fn, ln in names if fn]
    return with_first_name if with_first_name else [names[0]]