def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Fetch a ketab.ir page and extract book citation fields from it.

    The page is fetched with ``request``.  If that raises
    ``RequestException`` the exception is logged and ``None`` is returned,
    so the caller can fall back to another source.  Otherwise the response
    body is decoded as UTF-8 and searched with the module-level patterns.

    :param ketabir_url: address of the page to fetch.
    :return: ``None`` if the request failed; otherwise a ``defaultdict``
        (missing keys read as ``None``) with ``cite_type`` set to
        ``'book'``, ``title``, and whichever of ``authors``, ``others``,
        ``editors``, ``translators``, ``publisher``, ``month``, ``year``,
        ``isbn``, ``volume`` and ``publisher-location`` were found.
    """
    try:
        # Try to see if ketabir is available;
        # ottobib should continue its work in isbn.py if it is not.
        r = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return None
    html = r.content.decode('utf-8')

    d = defaultdict(lambda: None, cite_type='book')
    # NOTE(review): there is no guard on this lookup -- presumably
    # TITLE_SEARCH returns None for a page without a title, in which case
    # the subscript raises TypeError.  Confirm that this is intended.
    d['title'] = TITLE_SEARCH(html)[1]

    # Contributors are collected per role; any role other than the three
    # listed here goes to ``others`` with the role appended to the name.
    authors = []
    translators = []
    editors = []
    others = []
    names_by_role = {
        'نويسنده': authors,
        'مترجم': translators,
        'ويراستار': editors,
    }
    for role, name in AUTHORS_FINDALL(html):
        names = names_by_role.get(role)
        if names is None:
            others.append(('', f'{name} ({role})'))
        else:
            names.append(first_last(name))

    # Only non-empty lists are stored, so absent roles keep reading as None.
    for key, names in (
        ('authors', authors),
        ('others', others),
        ('editors', editors),
        ('translators', translators),
    ):
        if names:
            d[key] = names

    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]

    m = DATE_SEARCH(html)
    if m:
        # The month is stored as matched and the year gets the '۱۳' prefix
        # (presumably the page shows only the last two digits of the year
        # -- TODO confirm).
        d['month'] = m['month']
        d['year'] = '۱۳' + m['year']

    m = ISBN_SEARCH(html)
    if m:
        d['isbn'] = m[1]

    m = VOLUME_SEARCH(html)
    if m:
        d['volume'] = m[1]

    m = LOCATION_SEARCH(html)
    if m:
        d['publisher-location'] = m[1]

    return d
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Build a book citation dictionary from the page at *ketabir_url*.

    Returns ``None`` (after logging the exception) when ``request`` raises
    ``RequestException``; this lets the caller carry on with another
    source.  On success the UTF-8 decoded page is searched with the
    module-level patterns and the result is a ``defaultdict`` whose missing
    keys read as ``None`` and whose ``cite_type`` is ``'book'``.
    """
    try:
        # Try to see if ketabir is available;
        # ottobib should continue its work in isbn.py if it is not.
        response = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return None
    html = response.content.decode('utf-8')

    d = defaultdict(lambda: None, cite_type='book')
    # NOTE(review): unguarded -- if TITLE_SEARCH yields None (presumably
    # when the page has no title) this subscript raises TypeError.
    d['title'] = TITLE_SEARCH(html)[1]

    # Sort every (role, name) pair found on the page into its name list.
    authors, others, editors, translators = [], [], [], []
    for role, name in AUTHORS_FINDALL(html):
        if role == 'نويسنده':
            authors.append(first_last(name))
        elif role == 'مترجم':
            translators.append(first_last(name))
        elif role == 'ويراستار':
            editors.append(first_last(name))
        else:
            # Unrecognised role: keep the name and record the role with it.
            others.append(('', f'{name} ({role})'))

    # Empty lists are not stored.
    if authors:
        d['authors'] = authors
    if others:
        d['others'] = others
    if editors:
        d['editors'] = editors
    if translators:
        d['translators'] = translators

    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]

    m = DATE_SEARCH(html)
    if m:
        # Stored as matched; the year is prefixed with '۱۳'.
        d['month'] = m['month']
        d['year'] = '۱۳' + m['year']

    # The remaining fields all follow the same "search, store group 1"
    # pattern.
    for search, key in (
        (ISBN_SEARCH, 'isbn'),
        (VOLUME_SEARCH, 'volume'),
        (LOCATION_SEARCH, 'publisher-location'),
    ):
        m = search(html)
        if m:
            d[key] = m[1]

    return d
def ris_parse(ris_text):
    """Turn RIS text into a dictionary of citation fields.

    The text is matched with ``RIS_FULLMATCH`` and the named groups of the
    match seed the result, a ``defaultdict`` whose missing keys read as
    ``None``.  Derived keys are then added where applicable: ``journal``,
    ``cite_type``, ``isbn`` or ``issn``, ``authors``, ``doi`` and ``page``;
    ``url`` is cut down to its first ``;``-separated part.
    """
    fields = defaultdict(lambda: None)
    ris_match = RIS_FULLMATCH(ris_text)
    fields.update(ris_match.groupdict())

    # Reference type (book, journal, ...), lower-cased.
    ref_type = fields['type'].lower()
    if ref_type == 'jour':
        secondary_title = fields['t2']
        if secondary_title is not None:
            fields['journal'] = secondary_title

    # An 'elec' reference is reported as 'web', but only if it has a URL.
    raw_url = fields['url']
    fields['cite_type'] = (
        'web' if ref_type == 'elec' and raw_url else ref_type
    )

    serial_number = fields['sn']
    if serial_number:
        # SN can hold an ISBN or an ISSN; it is filed as an ISBN when
        # ISBN_10OR13_SEARCH finds a match in it, as an ISSN otherwise.
        id_key = 'isbn' if ISBN_10OR13_SEARCH(serial_number) else 'issn'
        fields[id_key] = serial_number

    # The 'authors' key must not exist unless author fields were captured.
    raw_authors = ris_match.captures('author')
    if raw_authors:
        # From RIS Format Specifications:
        # Each author must be on a separate line, preceded by this tag.
        # Each reference can contain unlimited author fields, and can
        # contain up to 255 characters for each field.
        # The author name must be in the following syntax:
        # Lastname,Firstname,Suffix
        # For Firstname, you can use full names, initials, or both.
        parsed_authors = []
        for raw_author in raw_authors:
            try:
                name = first_last(raw_author, separator=',')
            except InvalidNameError:
                # Unparsable names are skipped.
                continue
            else:
                parsed_authors.append(name)
        fields['authors'] = parsed_authors

    # A DOI may sit in the N1 (notes) tag, so the whole text is searched.
    doi_match = DOI_SEARCH(ris_text)
    if doi_match is not None:
        fields['doi'] = doi_match[0]

    first_page = fields['start_page']
    if first_page:
        last_page = fields['end_page']
        if last_page:
            fields['page'] = '–'.join((first_page, last_page))
        else:
            fields['page'] = first_page

    # In RIS, several URLs can be separated using a ";"; keep the first.
    if raw_url:
        fields['url'] = raw_url.split(';', 1)[0]

    return fields
def _bibtex_names(field):
    """Split a BibTeX name field on ' and ' and parse each non-empty part.

    Returns the list of ``first_last`` results, possibly empty.
    """
    names = []
    for name in field.split(' and '):
        # A part may still end in a dangling ' and' (no trailing space, so
        # the split did not consume it); drop it.
        if name.endswith(' and'):
            name = name[:-4]
        if not name:
            continue
        names.append(first_last(name))
    return names


def parse(bibtex):
    """Parse a BibTeX string and return a dictionary of information.

    The string is first passed through ``special_sequence_cleanup`` and
    its tags are collected by ``search_for_tag``.  On top of those tags:

    * ``cite_type`` is set (stripped, lower-cased) when ``TYPE_SEARCH``
      matches, e.g. book, journal, incollection;
    * a non-empty ``author`` / ``editor`` tag is replaced by an
      ``authors`` / ``editors`` list of parsed names;
    * a non-empty ``pages`` tag yields ``page`` with spaces removed and
      ``--`` / ``-`` turned into an en dash.
    """
    bibtex = special_sequence_cleanup(bibtex)
    d = search_for_tag(bibtex)

    # cite_type: book, journal, incollection, etc.
    m = TYPE_SEARCH(bibtex)
    if m:
        d['cite_type'] = m[1].strip().lower()

    # Authors and editors share the same format and handling.
    for tag, names_key in (('author', 'authors'), ('editor', 'editors')):
        raw_names = d[tag]
        if raw_names:
            d[names_key] = _bibtex_names(raw_names)
            del d[tag]

    pages = d['pages']
    if pages:
        d['page'] = (
            pages.replace(' ', '').replace('--', '–').replace('-', '–')
        )
    return d
def parse(ris_text):
    """Parse RIS text and return its citation data as a dictionary.

    The result is a ``defaultdict`` (missing keys read as ``None``) seeded
    with the named groups of ``RIS_FULLMATCH(ris_text)``.  ``cite_type``
    is always added; ``authors``, ``doi`` and ``page`` are added when the
    corresponding data is present, and ``url`` is reduced to its first
    ``;``-separated part.
    """
    result = defaultdict(lambda: None)
    full_match = RIS_FULLMATCH(ris_text)
    result.update(full_match.groupdict())

    # Reference type (book, journal, ...); 'elec' with a URL becomes 'web'.
    kind = result['type'].lower()
    link = result['url']
    if kind == 'elec' and link:
        kind = 'web'
    result['cite_type'] = kind

    # result['authors'] is created only when author fields were captured.
    captured = full_match.captures('author')
    if captured:
        # From RIS Format Specifications:
        # Each author must be on a separate line, preceded by this tag.
        # Each reference can contain unlimited author fields, and can
        # contain up to 255 characters for each field.
        # The author name must be in the following syntax:
        # Lastname,Firstname,Suffix
        # For Firstname, you can use full names, initials, or both.
        people = result['authors'] = []
        for entry in captured:
            try:
                person = first_last(entry, separator=',')
            except InvalidNameError:
                # Names that cannot be parsed are left out.
                continue
            people.append(person)

    # DOIs may be in the N1 (notes) tag, so search the whole text for one.
    found_doi = DOI_SEARCH(ris_text)
    if found_doi:
        result['doi'] = found_doi[0]

    page_from = result['start_page']
    if page_from:
        page_to = result['end_page']
        result['page'] = (
            page_from + '–' + page_to if page_to else page_from
        )

    # In RIS, urls can be separated using a ";"; only the first is kept.
    if link:
        first_link, _, _ = link.partition(';')
        result['url'] = first_link

    return result
def byline_to_names(byline) -> Optional[List[Tuple[str, str]]]:
    r"""Find author names in a byline string.

    Only the part of *byline* before the first ``|`` is considered.  The
    "By " prefix will be omitted.  Names will be separated either with
    " and " or ", ".  If any of the STOPWORDS is found in a name then it
    will be omitted from the result.

    :return: a list of ``(first, last)`` tuples, or ``None`` when no name
        can be extracted (the byline contains a colon or a four-digit
        number, is empty once a date has been cut off, or yields no valid
        name).  Names without a first name (organisations) are dropped,
        unless that would leave nothing; then the first name found is
        kept.

    Examples (informal; the first/last split itself is done by
    ``first_last``)::

        '\n By Roger Highfield, Science Editor \n'
            gives one name: Roger Highfield
        ' By Erika Solomon in Beirut and Borzou Daragahi, '
        'Middle East correspondent'
            gives two names: Erika Solomon and Borzou Daragahi
    """
    byline = byline.partition('|')[0]
    if ':' in byline:
        return None
    m = ANYDATE_SEARCH(byline)
    if m:
        # Removing the date part
        byline = byline[:m.start()]
    if not byline:
        return None
    if FOUR_DIGIT_NUM(byline):
        return None
    # Normalize 'and\n' (and the similar) to standard ' and '.
    # This should be done before cutting the byline at the first newline.
    byline = NORMALIZE_ANDS(' and ', byline)
    byline = NORMALIZE_COMMA_SPACES(', ', byline)
    # Remove starting "by", cut at the first newline and lstrip
    byline = BY_PREFIX(r'\1', byline)
    # Removing ending " and" or ',' and rstrip
    byline = AND_OR_COMMA_SUFFIX('', byline)
    if ' and ' in byline.lower() or ' ' in byline.partition(', ')[0]:
        fullnames = AND_OR_COMMA_SPLIT(byline)
    else:
        # Comma may be the separator of first name and last name.
        fullnames = AND_SPLIT(byline)

    names = []
    for fullname in fullnames:
        # Drop whatever follows ' in ' or ' for '.
        fullname = fullname.partition(' in ')[0].partition(' for ')[0]
        if STOPWORDS_SEARCH(fullname):
            continue
        try:
            first, last = first_last(fullname)
        except InvalidNameError:
            continue
        if first.startswith(('The ', 'خبرگزار')) or last.islower():
            # Not a personal name: keep the whole text as the last name so
            # that the entry has no first name.
            first, last = '', first + ' ' + last
        names.append((first, last))
    if not names:
        return None

    # Remove names not having first name (orgs), but fall back to the
    # first name found in case no name remains at the end.
    name0 = names[0]
    names = [(fn, ln) for fn, ln in names if fn]
    if not names:
        names.append(name0)
    return names