def aoi_process_contributors_bhl(*args):
    """Parse contributor names for BHL (AOI variant).

    Accepts any mix of name strings and lists of name strings. Names that
    contain ', ' are treated as people (trailing life dates such as
    '1832-1901' are stripped); all other names are treated as institutions
    (tab characters are stripped).

    :param args: name strings and/or lists of name strings
    :return: list of parsed name dicts from ``default_name_parser`` /
             ``institution_name_parser``, or ``''`` when no names are given
    """
    # Flatten: each arg may be a list of names or a single (truthy) name.
    names = []
    for arg in args:
        if isinstance(arg, list):
            names.extend(arg)
        elif arg:
            names.append(arg)
    # People names contain ', '; remove date ranges like '1832-1901'.
    people = [
        re.sub(r'\d+-(\d+)?', '', n).strip()
        for n in names if ', ' in n
    ]
    # Institution names: clean tabs.
    # BUG FIX: the original pattern r'\\t' matched a literal backslash
    # followed by 't', never an actual tab. Match both the old two-char
    # sequence (backward compatible) and a real tab character.
    inst = [
        re.sub(r'\\t|\t', '', n).strip()
        for n in names if ', ' not in n
    ]
    # Dispatch to the appropriate parser(s) depending on what was found.
    if people and inst:
        return default_name_parser(people) + institution_name_parser(inst)
    elif inst:
        return institution_name_parser(inst)
    if people:
        return default_name_parser(people)
    else:
        return ''
def process_contributors(*args):
    """Parse contributor names for BHL.

    Each positional arg must be an iterable of name strings; they are
    flattened into one list. Names containing ', ' are treated as people
    (life dates such as '1832-1901' are stripped); the rest are treated as
    institutions (tabs are stripped).

    :param args: iterables of name strings
    :return: combined parsed-name list, or ``[{'name': ''}]`` when empty
    """
    names = [name for sublist in args for name in sublist]
    # People names contain ', '; remove date ranges like '1832-1901'.
    people = [re.sub(r'\d+-(\d+)?', '', n).strip() for n in names if ', ' in n]
    # Institution names: clean tabs.
    # BUG FIX: the original pattern r'\\t' matched a literal backslash
    # followed by 't', never an actual tab. Match both the old two-char
    # sequence (backward compatible) and a real tab character.
    inst = [re.sub(r'\\t|\t', '', n).strip() for n in names if ', ' not in n]
    # Fall back to a single empty-name record so callers always get a list.
    return (default_name_parser(people) + institution_name_parser(inst)) or [{'name': ''}]
def aoi_process_contributors_bhl(*args):
    """Parse contributor names for BHL (AOI variant).

    Accepts any mix of name strings and lists of name strings. Names that
    contain ', ' are treated as people (trailing life dates such as
    '1832-1901' are stripped); all other names are treated as institutions
    (tab characters are stripped).

    :param args: name strings and/or lists of name strings
    :return: list of parsed name dicts from ``default_name_parser`` /
             ``institution_name_parser``, or ``''`` when no names are given
    """
    # Flatten: each arg may be a list of names or a single (truthy) name.
    names = []
    for arg in args:
        if isinstance(arg, list):
            names.extend(arg)
        elif arg:
            names.append(arg)
    # People names contain ', '; remove date ranges like '1832-1901'.
    people = [re.sub(r'\d+-(\d+)?', '', n).strip() for n in names if ', ' in n]
    # Institution names: clean tabs.
    # BUG FIX: the original pattern r'\\t' matched a literal backslash
    # followed by 't', never an actual tab. Match both the old two-char
    # sequence (backward compatible) and a real tab character.
    inst = [re.sub(r'\\t|\t', '', n).strip() for n in names if ', ' not in n]
    # Dispatch to the appropriate parser(s) depending on what was found.
    if people and inst:
        return default_name_parser(people) + institution_name_parser(inst)
    elif inst:
        return institution_name_parser(inst)
    if people:
        return default_name_parser(people)
    else:
        return ''
def process_contributors(*args):
    """Parse contributor names for BHL.

    Each positional arg must be an iterable of name strings; they are
    flattened into one list. Names containing ', ' are treated as people
    (life dates such as '1832-1901' are stripped); the rest are treated as
    institutions (tabs are stripped).

    :param args: iterables of name strings
    :return: combined parsed-name list, or ``[{'name': ''}]`` when empty
    """
    names = [name for sublist in args for name in sublist]
    # People names contain ', '; remove date ranges like '1832-1901'.
    people = [
        re.sub(r'\d+-(\d+)?', '', n).strip()
        for n in names if ', ' in n
    ]
    # Institution names: clean tabs.
    # BUG FIX: the original pattern r'\\t' matched a literal backslash
    # followed by 't', never an actual tab. Match both the old two-char
    # sequence (backward compatible) and a real tab character.
    inst = [
        re.sub(r'\\t|\t', '', n).strip()
        for n in names if ', ' not in n
    ]
    # Fall back to a single empty-name record so callers always get a list.
    return (default_name_parser(people) + institution_name_parser(inst)) or [{
        'name': ''
    }]
def nih_name_parser(names, org_name):
    """Parse a list of names and pair each with its organization affiliation.

    :param names: raw name strings, parsed via ``default_name_parser``
    :param org_name: organization names, paired positionally with ``names``
                     (iteration stops at the shorter of the two)
    :return: list of parsed names with affiliations attached
    """
    parsed = default_name_parser(names)
    return [add_affiliation(person, org) for person, org in zip(parsed, org_name)]
class FigshareHarvester(JSONHarvester):
    """Harvester for figshare's v1 article-search API.

    Fetches article records as JSON and maps them onto the shared schema
    via the ``JSONHarvester`` base class (declared elsewhere in the project).
    """

    short_name = 'figshare'
    long_name = 'figshare'
    url = 'http://figshare.com/'
    # Base search endpoint; harvest() appends the from/to date range.
    URL = 'http://api.figshare.com/v1/articles/search?search_for=*&from_date='

    # JSON-pointer-style mapping from the figshare record to the normalized
    # document; tuple entries apply the given transform to the extracted value.
    schema = {
        'title': '/title',
        'description': '/description',
        'contributors': ('/authors', lambda x: default_name_parser([person['author_name'] for person in x])),
        'providerUpdatedDateTime': ('/modified_date', date_formatter),
        'uris': {
            # figshare sometimes returns DOI as a one-element list; unwrap it.
            'canonicalUri': ('/DOI', lambda x: x[0] if isinstance(x, list) else x),
            'providerUris': [
                ('/url')
            ]
        },
        'otherProperties': build_properties(
            ('serviceID', ('/article_id', str)),
            ('definedType', '/defined_type'),
            ('type', '/type'),
            ('links', '/links'),
            ('publishedDate', '/published_date')
        )
    }

    def harvest(self, start_date=None, end_date=None):
        """Harvest figshare records for the given date window.

        Figshare should always have a 24 hour delay because they manually
        go through and check for test projects. Most of them are removed
        within 24 hours. So, we will shift everything back a day with
        harvesting to ensure nothing is harvested on the day of.

        :param start_date: optional date; defaults to today minus
            ``1 + settings.DAYS_BACK`` days
        :param end_date: optional date; defaults to yesterday
        :return: list of ``RawDocument`` objects, one per harvested record
        """
        # Shift the window back one day (see docstring for why).
        start_date = start_date - timedelta(1) if start_date else date.today() - timedelta(1 + settings.DAYS_BACK)
        end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1)
        search_url = '{0}{1}&to_date={2}'.format(
            self.URL,
            start_date.isoformat(),
            end_date.isoformat()
        )
        records = self.get_records(search_url)
        record_list = []
        for record in records:
            doc_id = record['article_id']
            record_list.append(
                RawDocument(
                    {
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': six.text_type(doc_id),
                        'filetype': 'json'
                    }
                )
            )
        return record_list

    def get_records(self, search_url):
        """Page through the search results until all reported items are fetched.

        :param search_url: fully-formed search URL (date range included)
        :return: list of raw record dicts
        """
        records = requests.get(search_url)
        # 'items_found' is the API's total hit count across all pages.
        total_records = records.json()['items_found']
        page = 1
        all_records = []
        while len(all_records) < total_records:
            record_list = records.json()['items']
            for record in record_list:
                # Guard against the last page over-filling past the total.
                if len(all_records) < total_records:
                    all_records.append(record)
            page += 1
            # NOTE(review): 'throttle' is not a kwarg of the standard requests
            # library -- presumably this is the project's rate-limited wrapper;
            # confirm against the local 'requests' module.
            records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)
        return all_records
def process_contributors(authors):
    """Split a free-text author string into individual names and parse them.

    :param authors: author names separated by ', ' or ' and ', or None
    :return: list of parsed name dicts; ``[]`` when ``authors`` is None
    """
    if authors is None:
        return []
    # FIX: use a raw string for the regex -- '\s' inside a plain string is an
    # invalid escape sequence (DeprecationWarning, and a SyntaxWarning/error
    # in newer Python versions). The matched pattern is unchanged.
    authors = re.split(r',\s|\sand\s', authors)
    return default_name_parser(authors)