Code Example #1
import re

def aoi_process_contributors_bhl(*args):
    '''Parse people names for BHL.'''
    names = []
    for arg in args:
        if isinstance(arg, list):
            for name in arg:
                names.append(name)
        elif arg:
            names.append(arg)
    # Filter people names (they contain ', ') and strip dates and extra spaces.
    people = [
        re.sub(r'\d+-(\d+)?', '', n).strip()
        for n in filter(lambda x: ', ' in x, names)
    ]
    # Filter institution names and strip literal '\t' sequences.
    inst = [
        re.sub(r'\\t', '', n).strip()
        for n in filter(lambda x: ', ' not in x, names)
    ]
    # Parse names differently depending on whether they are people's or institutions'.
    if people and inst:
        return default_name_parser(people) + institution_name_parser(inst)
    elif inst:
        return institution_name_parser(inst)
    elif people:
        return default_name_parser(people)
    else:
        return []
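For reference, a sketch of how the routing behaves on mixed input (sample names hypothetical; `default_name_parser` and `institution_name_parser` are scrapi helpers assumed to be in scope):

contributors = aoi_process_contributors_bhl(
    ['Darwin, Charles, 1809-1882'],  # contains ', ' -> routed to default_name_parser, date stripped
    'Smithsonian Institution'        # no ', ' -> routed to institution_name_parser
)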
Code Example #2
File: bhl.py Project: AndrewSallans/scrapi
import re

def process_contributors(*args):
    '''Parse people names for BHL.'''
    # Flatten the argument lists; assumes every arg is itself a list of names.
    names = [name for sublist in args for name in sublist]
    # Filter people names (they contain ', ') and strip dates and extra spaces.
    people = [re.sub(r'\d+-(\d+)?', '', n).strip() for n in filter(lambda x: ', ' in x, names)]
    # Filter institution names and strip literal '\t' sequences.
    inst = [re.sub(r'\\t', '', n).strip() for n in filter(lambda x: ', ' not in x, names)]
    # Parse people's and institutions' names separately; fall back to a blank contributor.
    return (default_name_parser(people) + institution_name_parser(inst)) or [{'name': ''}]
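Note the fallback: when both filtered lists are empty the concatenation is falsy, so the `or` yields a single blank contributor (assuming both parsers return an empty list for empty input):

process_contributors([])  # -> [{'name': ''}]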
Code Example #3
File: bhl.py Project: Johnetordoff/scrapi
import re

def aoi_process_contributors_bhl(*args):
    '''Parse people names for BHL.'''
    names = []
    for arg in args:
        if isinstance(arg, list):
            for name in arg:
                names.append(name)
        elif arg:
            names.append(arg)
    # Filter people names (they contain ', ') and strip dates and extra spaces.
    people = [re.sub(r'\d+-(\d+)?', '', n).strip() for n in filter(lambda x: ', ' in x, names)]
    # Filter institution names and strip literal '\t' sequences.
    inst = [re.sub(r'\\t', '', n).strip() for n in filter(lambda x: ', ' not in x, names)]
    # Parse names differently depending on whether they are people's or institutions'.
    if people and inst:
        return default_name_parser(people) + institution_name_parser(inst)
    elif inst:
        return institution_name_parser(inst)
    elif people:
        return default_name_parser(people)
    else:
        return []
Code Example #4
File: bhl.py Project: zamattiac/scrapi
import re

def process_contributors(*args):
    '''Parse people names for BHL.'''
    # Flatten the argument lists; assumes every arg is itself a list of names.
    names = [name for sublist in args for name in sublist]
    # Filter people names (they contain ', ') and strip dates and extra spaces.
    people = [
        re.sub(r'\d+-(\d+)?', '', n).strip()
        for n in filter(lambda x: ', ' in x, names)
    ]
    # Filter institution names and strip literal '\t' sequences.
    inst = [
        re.sub(r'\\t', '', n).strip()
        for n in filter(lambda x: ', ' not in x, names)
    ]
    # Parse people's and institutions' names separately; fall back to a blank contributor.
    return (default_name_parser(people) + institution_name_parser(inst)) or [{'name': ''}]
Code Example #5
File: nih.py Project: zamattiac/scrapi
def nih_name_parser(names, org_name):
    """
    Takes a list of names and a list of organization names,
    and attempts to parse them.
    """
    names = default_name_parser(names)
    # Pair each parsed name with its corresponding organization as an affiliation.
    return list(map(add_affiliation, names, org_name))
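For illustration, the expected call shape (sample data hypothetical; `add_affiliation` is assumed to attach each organization name to the parsed name dict it is paired with):

parsed = nih_name_parser(
    ['Curie, Marie', 'Pasteur, Louis'],
    ['Sorbonne', 'Institut Pasteur']
)
# map() pairs names and organizations positionally, one affiliation per contributor.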
Code Example #6
File: nih.py Project: AndrewSallans/scrapi
def nih_name_parser(names, org_name):
    """
    Takes a list of names and a list of organization names,
    and attempts to parse them.
    """
    names = default_name_parser(names)
    # Pair each parsed name with its corresponding organization as an affiliation.
    return list(map(add_affiliation, names, org_name))
Code Example #7
File: figshare.py Project: NeuroVault/scrapi
# Imports as typically found in scrapi harvesters (module paths assumed):
import json
from datetime import date, timedelta

import six

from scrapi import requests  # scrapi's throttling wrapper around the requests library
from scrapi import settings
from scrapi.base import JSONHarvester
from scrapi.base.helpers import build_properties, date_formatter, default_name_parser
from scrapi.linter.document import RawDocument


class FigshareHarvester(JSONHarvester):
    short_name = 'figshare'
    long_name = 'figshare'
    url = 'http://figshare.com/'

    URL = 'http://api.figshare.com/v1/articles/search?search_for=*&from_date='

    schema = {
        'title': '/title',
        'description': '/description',
        'contributors': ('/authors', lambda x: default_name_parser([person['author_name'] for person in x])),
        'providerUpdatedDateTime': ('/modified_date', date_formatter),
        'uris': {
            'canonicalUri': ('/DOI', lambda x: x[0] if isinstance(x, list) else x),
            'providerUris': ['/url']
        },
        'otherProperties': build_properties(
            ('serviceID', ('/article_id', str)),
            ('definedType', '/defined_type'),
            ('type', '/type'),
            ('links', '/links'),
            ('publishedDate', '/published_date')
        )
    }

    def harvest(self, start_date=None, end_date=None):
        """ Figshare should always have a 24 hour delay because they
        manually go through and check for test projects. Most of them
        are removed within 24 hours.

        So, we will shift everything back a day with harvesting to ensure
        nothing is harvested on the day of.
        """
        start_date = start_date - timedelta(1) if start_date else date.today() - timedelta(1 + settings.DAYS_BACK)
        end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1)

        search_url = '{0}{1}&to_date={2}'.format(
            self.URL,
            start_date.isoformat(),
            end_date.isoformat()
        )

        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['article_id']

            record_list.append(
                RawDocument(
                    {
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': six.text_type(doc_id),
                        'filetype': 'json'
                    }
                )
            )

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['items_found']
        page = 1

        all_records = []
        # Page through the search results until every reported item is collected.
        while len(all_records) < total_records:
            record_list = records.json()['items']

            for record in record_list:
                if len(all_records) < total_records:
                    all_records.append(record)

            page += 1
            records = requests.get(search_url + '&page={}'.format(page), throttle=3)

        return all_records
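A minimal sketch of driving the harvester over a fixed window (dates arbitrary; assumes scrapi's settings are configured):

harvester = FigshareHarvester()
raw_docs = harvester.harvest(start_date=date(2015, 6, 2), end_date=date(2015, 6, 3))
# harvest() shifts both dates back one day, so this queries 2015-06-01 to 2015-06-02.
print(len(raw_docs))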
Code Example #8
import re

def process_contributors(authors):
    '''Split a free-text author string and parse the individual names.'''
    if authors is None:
        return []
    # Split the byline on ', ' or on the word 'and'.
    authors = re.split(r',\s|\sand\s', authors)
    return default_name_parser(authors)
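For illustration, the split pattern breaks a byline on ', ' and ' and ' (sample string hypothetical):

re.split(r',\s|\sand\s', 'Jane Smith, John Doe and Alice Jones')
# -> ['Jane Smith', 'John Doe', 'Alice Jones']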
Code Example #9
File: neurovault.py Project: AndrewSallans/scrapi
import re

def process_contributors(authors):
    '''Split a free-text author string and parse the individual names.'''
    if authors is None:
        return []
    # Split the byline on ', ' or on the word 'and'.
    authors = re.split(r',\s|\sand\s', authors)
    return default_name_parser(authors)