Python SourceParserの例

プログラミング言語: Python

名前空間/パッケージ名: generic_parser

メソッド/関数: SourceParser

hotexamples.comのコード掲載数: 10

Python SourceParser - 10件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのgeneric_parser.SourceParserの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: springer_compact_parser.py プロジェクト: sejoska/orpheus

def main():
    """Import the Springer Compact journal list CSV into Orpheus.

    Registers the source and a 'Springer' publisher node, then reads
    ``datasets/tabula-Eligible Open Choice Journals CCBY.csv`` row by row.
    Every journal is run through ``match2node`` / ``act_on_orpheus_match``
    and then handed to two policy matchers: ``oa_status`` and ``deal``
    (type 'SPRINGER COMPACT', applying to 'INSTITUTIONS').
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    springer_source = generic_parser.SourceParser(
        'Springer Journals List 2019',
        url='https://resource-cms.springernature.com/springer-cms/rest/v1/content/829308/data/v3')
    source_id = springer_source.match_or_create_source()

    publisher = generic_parser.NodeParser(name='Springer', type='PUBLISHER')
    pub_id, pub_record, pub_match_type = publisher.match2node()
    publisher.node_id, publisher.node_record = generic_parser.act_on_orpheus_match(
        publisher, pub_id, pub_record, pub_match_type)

    csv_path = os.path.join(script_dir, 'datasets',
                            'tabula-Eligible Open Choice Journals CCBY.csv')
    progress_template = ('-------({}) Working on {}; oastatus: {}; '
                         'issn: {}; eissn: {}')

    with open(csv_path, encoding='utf-8') as csvfile:
        for row_number, row in enumerate(csv.DictReader(csvfile), start=1):
            print_issn = row['ISSN print'].strip()
            electronic_issn = row['ISSN electronic'].strip()
            # Titles may contain line breaks; flatten them to single spaces.
            title = row['Title'].replace('\n', ' ').replace('  ', ' ').strip()
            raw_status = row['Open Access'].strip()

            logger.info(progress_template.format(
                row_number, title, raw_status, print_issn, electronic_issn))

            journal = generic_parser.NodeParser(
                name=title,
                issn=print_issn,
                eissn=electronic_issn,
                publisher='Springer',
                source=source_id,
                publisher_node_id=publisher.node_id)

            # Translate the spreadsheet wording into upper-case status codes.
            status = raw_status.replace('Fully Open Access', 'FULLY_OA')
            status = status.replace('Hybrid (Open Choice)', 'HYBRID')
            journal.oa_status = status.upper()

            # Look the journal up in Orpheus and act on the outcome.
            found_id, found_record, found_type = journal.match2node()
            journal.node_id, journal.node_record = generic_parser.act_on_orpheus_match(
                journal, found_id, found_record, found_type)
            logger.debug('j_parser.node_record: {}'.format(journal.node_record))

            # A non-primary name points at its preferred node via 'synonym_of'.
            if journal.node_record['name_status'] == 'PRIMARY':
                preferred_node_id = journal.node_id
            else:
                preferred_node_id = journal.node_record['synonym_of']

            logger.debug('preferred_node_id: {}'.format(preferred_node_id))
            logger.debug('j_parser: {}'.format(vars(journal)))

            # Attach policies to the journal.
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()')
            journal.PolicyMatcher(journal, policy_type='oa_status').match()
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=deal).match()')
            journal.PolicyMatcher(journal, policy_type='deal').match(
                supersede_existing=False,
                applies_to='INSTITUTIONS',
                type='SPRINGER COMPACT')

コード例 #2

ファイルを表示

ファイル: wos_parser.py プロジェクト: sejoska/orpheus

def main():
    """Import the Clarivate Analytics Master Journal List CSVs into Orpheus.

    Every ``.csv`` file in ``datasets/clarivate_analytics`` is read; each
    data row becomes a ``NodeParser`` that is matched against RoMEO
    publishers and then against Orpheus nodes.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_dir = os.path.join(script_dir, 'datasets', 'clarivate_analytics')
    wos_source = generic_parser.SourceParser(
        'Clarivate Analytics Master Journal List',
        url='http://mjl.clarivate.com/#journal_lists')
    wos_source_id = wos_source.match_or_create_source()

    rows_seen = 0
    for filename in os.listdir(csv_dir):
        if not filename.endswith('.csv'):
            continue

        logger.info('------------- Working on file {}'.format(filename))
        csv_path = os.path.join(csv_dir, filename)
        with open(csv_path, encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile):
                rows_seen += 1
                title = row['Journal Title'].strip().replace('\n', '').title()
                publisher_name = row['Publisher'].strip().replace('\n', '').title()
                print_issn = row['ISSN'].strip()
                electronic_issn = row['E-ISSN'].strip()

                # Rows whose title is literally 'Journal Title' are ignored
                # (presumably a header line repeated inside the data).
                if title == 'Journal Title':
                    continue

                logger.info('-------------- ({}) Working on journal: {}'.format(
                    rows_seen, title))

                journal = generic_parser.NodeParser(
                    name=title,
                    issn=print_issn,
                    eissn=electronic_issn,
                    publisher=publisher_name,
                    source=wos_source_id)
                journal.match2romeo_publisher()

                # Look the journal up in Orpheus and act on the outcome.
                found_id, found_record, found_type = journal.match2node()
                journal.node_id, journal.node_record = generic_parser.act_on_orpheus_match(
                    journal, found_id, found_record, found_type)

コード例 #3

ファイルを表示

ファイル: europepmc_journal_list_parser.py プロジェクト: sejoska/orpheus

def main():
    """Import the Europe PMC journal list (``datasets/jlist.csv``) into Orpheus.

    For each row the publisher and the journal are matched in Orpheus, the
    'NLM TA' abbreviation (when present) is registered as a synonym of the
    journal's preferred node, and an ``epmc`` policy is attached.  Rows whose
    deposit status is ' Predecessor ' are skipped.
    """

    def _one_line(text):
        # Replace line breaks and double spaces with single spaces, then trim.
        return text.replace('\n', ' ').replace('  ', ' ').strip()

    # Translation tables from the spreadsheet wording to Orpheus values.
    epmc_embargo2months = {
        "": None,
        "Immediate": 0,
        "0 months or more": 0,
        "1 month": 1,
        "2 months": 2, "2 months or less": 2,
        "3 months": 3, "3 months or more": 3,
        "6 months": 6, "6 months or less": 6,
        "12 months": 12, "12 months or less": 12,
        "24 months": 24, "24 months or less": 24,
        "36 months": 36, "36 months or less": 36,
    }
    participation_level2orpheus = {
        " Full ": "FULL",
        " NIH Portfolio ": "NIH",
    }
    open_licence2orpheus = {
        "All": "ALL",
        "No": "NO",
        "Some": "SOME",
    }
    deposit_status2orpheus = {
        " ": None,
        " No New Content ": "NO_NEW",
        " Now Select ": "NOW_SELECT",
        " Predecessor ": "PRE",
    }

    script_dir = os.path.dirname(os.path.abspath(__file__))
    pmc_source = generic_parser.SourceParser(
        'PMC Journal List', url='https://europepmc.org/journalList')
    source_id = pmc_source.match_or_create_source()

    csv_path = os.path.join(script_dir, 'datasets', 'jlist.csv')

    with open(csv_path, encoding='utf-8') as csvfile:
        for row_number, row in enumerate(csv.DictReader(csvfile), start=1):
            print_issn = row['pISSN'].strip()
            electronic_issn = row['eISSN'].strip()
            title = _one_line(row['Journal title'])
            abbreviation = _one_line(row['NLM TA'])
            publisher_name = _one_line(row['Publisher'])
            embargo = row['Free access']
            open_licence = row['Open access']
            participation_level = row['Participation level']
            # These two column names carry a leading space in the CSV header.
            deposit_status = row[' Deposit status']
            epmc_url = row[' Journal URL']

            logger.info('-------({}) Working on {}; '
                        'issn: {}; eissn: {}'.format(
                            row_number, title, print_issn, electronic_issn))

            if deposit_status == " Predecessor ":
                logger.info('Skipped {} (title no longer in publication)'.format(title))
                continue

            logger.info('--- Parsing publisher info')
            publisher = generic_parser.NodeParser(name=publisher_name,
                                                  type='PUBLISHER')
            pub_id, pub_record, pub_match_type = publisher.match2node()
            publisher.node_id, publisher.node_record = generic_parser.act_on_orpheus_match(
                publisher, pub_id, pub_record, pub_match_type)
            if publisher.node_record['name_status'] == 'PRIMARY':
                publisher_preferred_id = publisher.node_id
            else:
                publisher_preferred_id = publisher.node_record['synonym_of']

            logger.info('--- Parsing journal info')
            journal = generic_parser.NodeParser(
                name=title,
                issn=print_issn,
                eissn=electronic_issn,
                publisher=publisher_name,
                source=source_id,
                publisher_node_id=publisher_preferred_id,
                epmc_url=epmc_url)

            # Look the journal up in Orpheus and act on the outcome.
            found_id, found_record, found_type = journal.match2node()
            journal.node_id, journal.node_record = generic_parser.act_on_orpheus_match(
                journal, found_id, found_record, found_type,
                prompt_responses=PROMPT_RESPONSES)
            logger.debug('j_parser.node_record: {}'.format(journal.node_record))

            # A non-primary name points at its preferred node via 'synonym_of'.
            if journal.node_record['name_status'] == 'PRIMARY':
                preferred_node_id = journal.node_id
            else:
                preferred_node_id = journal.node_record['synonym_of']

            # Register the abbreviation as a synonym of the preferred node.
            if abbreviation:
                logger.debug('--- Processing alternative title "{}"'.format(abbreviation))
                synonym = generic_parser.NodeParser(
                    name=abbreviation,
                    issn=print_issn,
                    eissn=electronic_issn,
                    publisher=journal.publisher,
                    publisher_node_record=journal.publisher_node_record,
                    publisher_node_id=journal.publisher_node_id,
                    source=source_id)
                synonym.name_status = 'SYNONYM'
                synonym.synonym_of = preferred_node_id
                syn_id, syn_record, syn_match_type = synonym.match2node()
                generic_parser.act_on_orpheus_match(
                    synonym, syn_id, syn_record, syn_match_type)

            logger.debug('preferred_node_id: {}'.format(preferred_node_id))
            logger.debug('j_parser: {}'.format(vars(journal)))

            # Attach the Europe PMC policy; an unknown spreadsheet value
            # raises KeyError from the translation tables above.
            logger.debug(
                '--- Calling j_parser.PolicyMatcher(j_parser, policy_type=epmc).match()')
            journal.PolicyMatcher(journal, policy_type='epmc').match(
                supersede_existing=False,
                participation_level=participation_level2orpheus[participation_level],
                embargo_months=epmc_embargo2months[embargo],
                open_licence=open_licence2orpheus[open_licence],
                deposit_status=deposit_status2orpheus[deposit_status])

コード例 #4

ファイルを表示

ファイル: orpheus_constants.py プロジェクト: sejoska/orpheus

import generic_parser

# Common sources
# NOTE: everything below runs at import time; each match_* call goes through
# generic_parser (presumably contacting the Orpheus server -- confirm).
doaj_source_parser = generic_parser.SourceParser('DOAJ',
                                                 url='https://doaj.org/')
DOAJ_SOURCE_ID = doaj_source_parser.match_or_create_source()

# Licence ids in Orpheus
# Parsers given a long name and URL are resolved with match_or_create_licence();
# the short-name-only ones are resolved with match_licence().
cc0_parser = generic_parser.LicenceParser(
    short_name='CC0',
    long_name='Public domain',
    url='https://creativecommons.org/publicdomain/zero/1.0/')
ccbynd_parser = generic_parser.LicenceParser(
    short_name='CC BY-ND',
    long_name='Creative Commons Attribution-NoDerivatives',
    url='https://creativecommons.org/licenses/by-nd/4.0/')
ccbysa_parser = generic_parser.LicenceParser(
    short_name='CC BY-SA',
    long_name='Creative Commons Attribution-ShareAlike',
    url='https://creativecommons.org/licenses/by-sa/4.0/')
ccby_parser = generic_parser.LicenceParser(short_name='CC BY')
ccbync_parser = generic_parser.LicenceParser(short_name='CC BY-NC')
ccbyncnd_parser = generic_parser.LicenceParser(short_name='CC BY-NC-ND')
# NOTE(review): no id is derived from ccbyncsa_parser or custom_parser in the
# lines visible here -- the listing may be truncated; confirm against the file.
ccbyncsa_parser = generic_parser.LicenceParser(short_name='CC BY-NC-SA')
custom_parser = generic_parser.LicenceParser(short_name='Custom')
CC0_ID = cc0_parser.match_or_create_licence()
CCBYND_ID = ccbynd_parser.match_or_create_licence()
CCBYSA_ID = ccbysa_parser.match_or_create_licence()
CCBY_ID = ccby_parser.match_licence()
CCBYNC_ID = ccbync_parser.match_licence()
CCBYNCND_ID = ccbyncnd_parser.match_licence()

コード例 #5

ファイルを表示

ファイル: oup_parser.py プロジェクト: sejoska/orpheus

def main():
    """Scrape the OUP embargo-periods web page and load its policies into Orpheus.

    Registers two sources (the embargo-periods page and the author
    self-archiving policy page) and the 'Oxford University Press' publisher
    node, downloads the embargo page and walks every ``<tr>``.  Within a row,
    odd-numbered ``<td>`` cells are treated as the journal name and
    even-numbered cells as the accepted-manuscript (AM) embargo information.
    For each even cell the journal receives an AM policy (or an ``oa_status``
    policy for fully open access titles), plus preprint, website and --
    unless fully OA -- version-of-record green policies.

    Exits via ``sys.exit`` when an embargo cell is reached before a preferred
    node id has been determined for the row.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    oup_url = 'https://academic.oup.com/journals/pages/access_purchase/rights_and_permissions/embargo_periods'
    source_parser = generic_parser.SourceParser(
        'OUP website: Accepted Manuscript Embargo Periods', url=oup_url)
    SOURCE_ID = source_parser.match_or_create_source()

    website_parser = generic_parser.SourceParser(
        'OUP website: Author self-archiving policy',
        url=
        'https://academic.oup.com/journals/pages/access_purchase/rights_and_permissions/'
        'author_self_archiving_policy')
    OUP_WEBSITE_ID = website_parser.match_or_create_source()

    p_parser = generic_parser.NodeParser(name='Oxford University Press',
                                         type='PUBLISHER',
                                         source=SOURCE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(oup_url) as response:
        r = response.read()
    s = BeautifulSoup(r, 'html.parser')

    journal_counter = 0
    # NOTE(review): nothing is ever appended to this list, so the summary
    # logged at the end is always empty -- confirm whether it is still needed.
    embargo_data_values = []
    for tr in s.find_all('tr'):
        # One parser per table row; its name is filled in by the odd cell.
        j_parser = generic_parser.NodeParser(
            publisher='Oxford University Press',
            source=SOURCE_ID,
            publisher_node_id=p_parser.node_id)
        preferred_node_id = None
        td_counter = 0
        for td in tr.find_all('td'):
            td_counter += 1
            # NOTE(review): jname and embargo_data are not reset per row, so a
            # cell without usable text/link silently reuses the previous value.
            if td.string:
                jname = td.string.replace('\n', '').strip()
            if td.a:
                if ('\n' in td.a.string) or ('Custom' in td.a.string):
                    # Drop the word 'months' and collapse all whitespace.
                    embargo_data = ' '.join(
                        td.a.string.replace(
                            'months',
                            '').split())  #https://stackoverflow.com/a/1546251
                else:
                    jname = td.a.string.replace('\n', '').strip()

            if td_counter % 2 == 0:  #Even (This td contains AM embargo_data)
                if embargo_data in ['Full Open Access', 'Fully Open Access']:
                    j_parser.oa_status = 'FULLY_OA'
                    logger.debug(
                        '{} is an open access journal. Calling j_parser.PolicyMatcher(j_parser, '
                        'policy_type=oa_status).match()'.format(j_parser.name))
                    j_parser.PolicyMatcher(j_parser,
                                           policy_type='oa_status').match()
                # NOTE(review): this is a substring test against the literal
                # 'Custom' (it also matches '' and e.g. 'Cus'); an equality or
                # reversed containment test may have been intended -- confirm.
                elif embargo_data in 'Custom':
                    logger.debug(
                        'Skipping custom AM self-archiving policy ({})'.format(
                            j_parser.name))
                else:
                    if not preferred_node_id:
                        sys.exit('preferred_node_id not set for {}'.format(
                            j_parser.name))
                    # Accepted manuscript: embargo length comes from the cell.
                    am_policy = generic_parser.GreenPolicyInstance()
                    am_policy.node = preferred_node_id
                    am_policy.outlet = [INST_REPO_ID, SUBJ_REPO_ID, PUBMED_ID]
                    am_policy.version = [AM_ID]
                    am_policy.version_embargo_months = int(embargo_data)
                    am_policy.version_green_licence = CUSTOM_ID
                    am_policy.source = SOURCE_ID
                    am_policy.verbatim = am_verbatim
                    logger.debug(
                        'Calling j_parser.server_data_match_green_policy(**am_policy.as_dict())'
                    )
                    j_parser.PolicyMatcher(
                        j_parser,
                        policy_type='green').match(**am_policy.as_dict())

                # Preprint policy: no embargo, all outlets.
                preprint_policy = generic_parser.GreenPolicyInstance()
                preprint_policy.node = preferred_node_id
                preprint_policy.outlet = ALL_OUTLETS
                preprint_policy.version = [PREPRINT_ID]
                preprint_policy.version_embargo_months = 0
                preprint_policy.version_green_licence = CUSTOM_ID
                preprint_policy.source = OUP_WEBSITE_ID
                preprint_policy.verbatim = preprint_verbatim
                logger.debug(
                    'Calling j_parser.server_data_match_green_policy(**preprint_policy.as_dict())'
                )
                j_parser.PolicyMatcher(
                    j_parser,
                    policy_type='green').match(**preprint_policy.as_dict())

                # AM on the author's own website: no embargo.
                website_policy = generic_parser.GreenPolicyInstance()
                website_policy.node = preferred_node_id
                website_policy.outlet = [WEBSITE_ID]
                website_policy.version = [AM_ID]
                website_policy.version_embargo_months = 0
                website_policy.version_green_licence = CUSTOM_ID
                website_policy.source = OUP_WEBSITE_ID
                website_policy.verbatim = 'Authors may make their AM available on their non-commercial homepage or blog. They may also privately share their work within their institution for the purposes of research or education, and make copies available to colleagues or students for their personal use providing that the AM is not made publicly available until after the embargo period.'
                logger.debug(
                    'Calling j_parser.server_data_match_green_policy(**website_policy.as_dict())'
                )
                j_parser.PolicyMatcher(
                    j_parser,
                    policy_type='green').match(**website_policy.as_dict())

                # Version of record: deposit not allowed unless fully OA.
                if j_parser.oa_status != 'FULLY_OA':
                    vor_policy = generic_parser.GreenPolicyInstance()
                    vor_policy.node = preferred_node_id
                    vor_policy.outlet = ALL_OUTLETS
                    vor_policy.deposit_allowed = False
                    vor_policy.version = [VOR_ID]
                    vor_policy.source = OUP_WEBSITE_ID
                    vor_policy.verbatim = vor_verbatim
                    logger.debug(
                        'Calling j_parser.server_data_match_green_policy(**vor_policy.as_dict())'
                    )
                    j_parser.PolicyMatcher(
                        j_parser,
                        policy_type='green').match(**vor_policy.as_dict())

            else:  # Odd (This td contains the journal name)
                journal_counter += 1
                logger.info('------------({}) Working on journal {}'.format(
                    journal_counter, jname))
                j_parser.name = jname
                j_parser.get_issn_from_romeo()

                # Attempt to find a match in Orpheus
                node_id, node_record, match_type = j_parser.match2node()
                j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                    j_parser, node_id, node_record, match_type)
                logger.debug('j_parser.node_record: {}'.format(
                    j_parser.node_record))
                # Determine the Orpheus id of the preferred name: a
                # non-primary name points at it via 'synonym_of'.
                if j_parser.node_record['name_status'] not in ['PRIMARY']:
                    preferred_node_id = j_parser.node_record['synonym_of']
                else:
                    preferred_node_id = j_parser.node_id

    logger.info('embargo_data_values: {}'.format(sorted(embargo_data_values)))

コード例 #6

ファイルを表示

ファイル: wiley_parser.py プロジェクト: sejoska/orpheus

def main():
    """Parse a saved copy of the Wiley Author Compliance Tool into Orpheus.

    Reads ``datasets/wiley_compliance_tool.html``, extracts the journal names
    from the ``<select id="journal">`` block and the per-journal attribute
    lists from the JavaScript variables JOAP, JL, JAPC, JSV and JAV.  For
    every journal after the first list entry it matches the journal in
    Orpheus and attaches an ``oa_status`` policy plus, where the data allow,
    gold, preprint and accepted-manuscript (AM) policies.

    Exits via ``sys.exit`` when the journal block cannot be located or when
    an attribute list does not have one value per journal.  An unknown OA
    status or licence wording raises ``KeyError`` from the translation dicts.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser(
        'Wiley Author Compliance Tool',
        url=
        'https://authorservices.wiley.com/author-resources/Journal-Authors/licensing-open-access/open-access/author-compliance-tool.html'
    )
    WILEY_SOURCE_ID = source_parser.match_or_create_source()

    p_parser = generic_parser.NodeParser(name='Wiley',
                                         type='PUBLISHER',
                                         source=WILEY_SOURCE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)

    # Context manager guarantees the file is closed even if read() fails.
    with open(os.path.join(BASE_DIR, 'datasets',
                           'wiley_compliance_tool.html')) as f:
        data = f.read()

    # translation dicts
    oa_status_dict = {
        'Offers OnlineOpen': 'HYBRID',
        'No OA option': 'SUBSCRIPTION',
        'Fully Open Access': 'FULLY_OA'
    }

    default_licence_options = [CCBY_ID, CCBYNC_ID, CCBYNCND_ID]

    # Keys are the exact strings found in the input (typos and trailing
    # spaces included), so they must not be normalised.
    licence_choices_dict = {
        'Choice of CC BY, CC BY-NC, or CC BY-NC-ND<br />CC BY for mandated authors':
        default_licence_options,
        '--': [],
        'Choice of CC BY-NC or CC BY-NC-ND<br />CC BY for mandated authors':
        default_licence_options,
        'No CC license offered': [CUSTOM_ID],
        'CC BY': [CCBY_ID],
        'CC BY for mandated authors': [CCBY_ID],
        'CC-BY': [CCBY_ID],
        'CC BY-NC-ND<br />CC BY for mandated authors': [CCBY_ID, CCBYNCND_ID],
        'Choice of CC BY, CC BY-NC, or CC BY-NC-ND':
        default_licence_options,
        'Choice of CC BY, CC BY-NC or CC BY-NC-ND':
        default_licence_options,
        'Choice of CC BY-NC or CC BY-ND-ND<br />CC-BY mandate only':
        default_licence_options,
        'CC BY, CC BY-NC, or CC BY-NC-ND<br />CC BY for mandated authors':
        default_licence_options,
        'CC BY-NC-ND': [CCBYNCND_ID],
        'CC BY, CC BY-NC, CC BY-NC-ND ':
        default_licence_options,
        'CC BY (mandated only), CC BY NC, CC BY NC ND':
        default_licence_options,
        'CC BY-NC<br />CC BY for mandated authors': [CCBY_ID, CCBYNC_ID],
        'CC BY NC ND': [CCBYNCND_ID]
    }

    # journal_block
    block_pattern = re.compile(
        r'''<select class="journal" id="journal"(.*)</select>\n</form>''',
        re.DOTALL)
    block_match = block_pattern.search(data)
    if block_match is None:
        # Fail with a readable message rather than AttributeError on None.
        sys.exit('Could not find the journal <select> block in input dataset.')
    # Join the one option that spans two lines so the per-line regex below
    # can match it.
    # NOTE(review): the search string includes literal double quotes around
    # the <option> element -- confirm that this matches the input file.
    journal_block = block_match.group(1).replace(
        '''"<option value='1132'>Journal of World Intellectual Property - The\n</option>"''',
        "<option value='1132'>Journal of World Intellectual Property - The</option>"
    )
    option_pattern = re.compile(
        r'''^<option value=['"]([0-9]+)['"]>(.+)</option>$''', re.MULTILINE)
    # Only the option text is needed; the numeric value is discarded.
    journals = [
        journal_name
        for _, journal_name in option_pattern.findall(journal_block)
    ]

    # attributes
    oa_stata = javascript_variable('JOAP')
    oa_stata.parse_values(data)

    gold_licences = javascript_variable('JL')
    gold_licences.parse_values(data)

    apcs = javascript_variable('JAPC')
    apcs.parse_values(data)

    preprint_embargos = javascript_variable('JSV')
    preprint_embargos.parse_values(data)

    am_embargos = javascript_variable('JAV')
    am_embargos.parse_values(data)

    # check that the number of values of each attribute matches the number of journals
    for a in [
            oa_stata.values, gold_licences.values, apcs.values,
            preprint_embargos.values, am_embargos.values
    ]:
        if len(a) != len(journals):
            error_msg = 'Number of values of variable ({}) does not match number of journals ({}). This could be because ' \
                        '`Journal of World Intellectual Property` spans more than 1 line in input dataset. Check and, ' \
                        'if so, edit the input to fix that. First 5 values of variable: {}'.format(len(a),
                                                                                                   len(journals), a[0:5])
            sys.exit(error_msg)

    t_apc_value = re.compile(r'[0-9,]+')
    # APC strings that carry no usable amount and are stored as a note only.
    apc_note_only = [
        'No APC', 'Inquire Directly ', 'Contact journal', '$50 per PU',
        'waived 2016-18'
    ]
    # The single APC string that spells out a price range.
    apc_range_str = '$1,800 for research article $900 for technical report'

    # parse information for each journal and add to Orpheus
    # journals[0] is skipped; counter is the index shared by `journals` and
    # every attribute list.
    for counter, j in enumerate(journals[1:], start=1):
        logger.info('---------{} Working on journal {}'.format(counter, j))
        j_parser = generic_parser.NodeParser(
            name=j,
            publisher='Wiley',
            source=WILEY_SOURCE_ID,
            publisher_node_id=p_parser.node_id)
        j_parser.oa_status = oa_status_dict[oa_stata.values[counter]]
        logger.debug('OA status: {}'.format(j_parser.oa_status))

        # obtain issn from romeo; identify romeo_publisher and its node in Orpheus
        j_parser.get_issn_from_romeo()

        # Attempt to find a match in Orpheus
        node_id, node_record, match_type = j_parser.match2node()
        j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
            j_parser, node_id, node_record, match_type)
        logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))
        # Determine the Orpheus id of the preferred name: a non-primary name
        # points at it via 'synonym_of'.
        if j_parser.node_record['name_status'] not in ['PRIMARY']:
            preferred_node_id = j_parser.node_record['synonym_of']
        else:
            preferred_node_id = j_parser.node_id

        # Parsing gold policy info
        licence_options_raw = gold_licences.values[counter]
        logger.debug('licence_options_raw: {}'.format(licence_options_raw))
        apc_raw_str = apcs.values[counter]
        logger.debug('apc_raw_str: {}'.format(apc_raw_str))
        if (j_parser.oa_status
                == 'SUBSCRIPTION') and (licence_options_raw
                                        == '--') and (apc_raw_str == '--'):
            has_gold_policy = False
        else:
            has_gold_policy = True
            logger.debug('has_gold_policy: {}'.format(has_gold_policy))
            gp = generic_parser.GoldPolicyInstance()
            gp.node = preferred_node_id
            gp.source = WILEY_SOURCE_ID
            gp.licence_options = licence_choices_dict[licence_options_raw]
            logger.debug('gp.licence_options: {}'.format(gp.licence_options))
            if apc_raw_str in ['', '--']:
                pass
            elif apc_raw_str == apc_range_str:
                # Only this string states the 900-1800 range; previously the
                # range was also written for note-only values like 'No APC'.
                gp.apc_note = apc_raw_str
                gp.apc_value_min = 900
                gp.apc_value_max = 1800
            elif apc_raw_str in apc_note_only:
                gp.apc_note = apc_raw_str
            elif apc_raw_str.strip() == '3500':
                gp.apc_currency = 'USD'
                gp.apc_value_min = 3500
                gp.apc_value_max = 3500
            else:
                # First run of digits/commas is taken as the APC amount.
                m_apc_value = t_apc_value.search(apc_raw_str)
                gp.apc_value_min = int(m_apc_value.group().replace(',', ''))
                gp.apc_value_max = gp.apc_value_min
                if '$' in apc_raw_str:
                    gp.apc_currency = 'USD'
                elif '€' in apc_raw_str:
                    gp.apc_currency = 'EUR'
                else:
                    logger.warning(
                        'Currency of APC {} could not be recognised. Journal {}'
                        .format(apc_raw_str, j))
            logger.debug('gp.apc_currency: {}'.format(gp.apc_currency))
            logger.debug('gp.apc_value_min: {}'.format(gp.apc_value_min))
            logger.debug('gp.apc_value_max: {}'.format(gp.apc_value_max))

        # parsing preprint policy
        preprint_embargo_raw = preprint_embargos.values[counter]
        if preprint_embargo_raw in [
                'Refer to copyright or contact managing editor', '--'
        ]:
            has_preprint_policy = False
        else:
            has_preprint_policy = True
            preprint_policy = generic_parser.GreenPolicyInstance()
            preprint_policy.node = preferred_node_id
            preprint_policy.outlet = [WEBSITE_ID, INST_REPO_ID, SUBJ_REPO_ID]
            preprint_policy.version = [PREPRINT_ID]
            preprint_policy.version_embargo_months = 0
            preprint_policy.version_green_licence = CUSTOM_ID
            preprint_policy.source = WILEY_SOURCE_ID
            preprint_policy.verbatim = preprint_embargo_raw
            logger.debug('has_preprint_policy: {}'.format(has_preprint_policy))
            logger.debug('preprint_policy.verbatim: {}'.format(
                preprint_policy.verbatim))

        # parsing AM policy
        am_embargo_raw = am_embargos.values[counter]
        logger.debug('am_embargo_raw: {}'.format(am_embargo_raw))
        am_embargo = am_embargo_raw.strip()
        if am_embargo in [
                'Refer to copyright or contact managing editor', '--',
                'Refer to copyright',
                'Does not publish unsolicited manuscripts'
        ]:
            has_am_policy = False
        else:
            has_am_policy = True
            am_policy = generic_parser.GreenPolicyInstance()
            am_policy.node = preferred_node_id
            am_policy.outlet = [WEBSITE_ID, INST_REPO_ID, SUBJ_REPO_ID]
            am_policy.version = [AM_ID]
            am_policy.version_green_licence = CUSTOM_ID
            am_policy.source = WILEY_SOURCE_ID
            am_policy.verbatim = am_embargo
            if am_embargo in ['Final version on publication']:
                # This wording concerns the final version, not the AM.
                am_policy.version = [VOR_ID]
                am_policy.version_embargo_months = 0
            elif am_embargo in ['On submission']:
                am_policy.version_embargo_months = 0
            elif am_embargo in ['6mo embargo']:
                am_policy.version_embargo_months = 6
            elif am_embargo in [
                    'Institutional repository after 6 month embargo'
            ]:
                am_policy.outlet = [INST_REPO_ID]
                am_policy.version_embargo_months = 6
            elif am_embargo in [
                    '12mo embargo', '12 months', '12-24mo embargo'
            ]:
                am_policy.version_embargo_months = 12
            elif am_embargo in ['18mo embargo']:
                am_policy.version_embargo_months = 18
            elif am_embargo in ['24mo embargo']:
                am_policy.version_embargo_months = 24
            elif am_embargo in ['Not permitted', 'Fully Open Access']:
                am_policy.deposit_allowed = False
            else:
                logger.error(
                    'Failed to parse embargo info ({}) for journal {}'.format(
                        am_embargo_raw, j))
            logger.debug('has_am_policy: {}'.format(has_am_policy))
            logger.debug('am_policy.verbatim: {}'.format(am_policy.verbatim))
            logger.debug('am_policy.version_embargo_months: {}'.format(
                am_policy.version_embargo_months))

        # Attach policies to preferred name node
        logger.debug(
            'Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()'
        )
        j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()
        if has_gold_policy:
            logger.debug(
                "Calling j_parser.PolicyMatcher(j_parser, policy_type='gold').match(**gp.as_dict())"
            )
            j_parser.PolicyMatcher(j_parser,
                                   policy_type='gold').match(**gp.as_dict())
        if has_preprint_policy:
            logger.debug(
                'Calling j_parser.server_data_match_green_policy(**preprint_policy.as_dict())'
            )
            j_parser.PolicyMatcher(
                j_parser,
                policy_type='green').match(**preprint_policy.as_dict())
        if has_am_policy:
            logger.debug(
                'Calling j_parser.server_data_match_green_policy(**am_policy.as_dict())'
            )
            j_parser.PolicyMatcher(
                j_parser, policy_type='green').match(**am_policy.as_dict())

コード例 #7

ファイルを表示

ファイル: elsevier_apc_parser.py プロジェクト: osc-cam/orpheus

def main():
    """Load the Elsevier open-access price list into Orpheus.

    Registers the two sources involved (the price-list PDF and the Elsevier
    licences web page), resolves the 'Elsevier' publisher node and then, for
    every row of the tabula-extracted CSV, matches or creates the journal
    node and attaches its OA status and gold (APC) policy.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    # --- sources ---------------------------------------------------------
    price_list_source = generic_parser.SourceParser(
        'Elsevier Open Access Price List',
        url='https://www.elsevier.com/__data/promis_misc/j.custom97.pdf')
    price_list_source_id = price_list_source.match_or_create_source()

    website_source = generic_parser.SourceParser(
        'Elsevier website: Open access licenses',
        url='https://www.elsevier.com/about/policies/open-access-licenses')
    website_source_id = website_source.match_or_create_source()

    # --- publisher node --------------------------------------------------
    publisher = generic_parser.NodeParser(
        name='Elsevier', type='PUBLISHER', source=website_source_id)
    pub_id, pub_record, pub_match_type = publisher.match2node()
    resolved_publisher = generic_parser.act_on_orpheus_match(
        publisher, pub_id, pub_record, pub_match_type)
    publisher.node_id, publisher.node_record = resolved_publisher

    # A single gold policy object is reused for every journal; only the
    # journal-specific fields are overwritten inside the loop.
    gold_policy = generic_parser.GoldPolicyInstance()
    gold_policy.licence_options = [CCBY_ID, CCBYNCND_ID, CUSTOM_ID]

    csv_path = os.path.join(base_dir, 'datasets', 'tabula-j.custom97.csv')

    with open(csv_path, encoding='utf-8') as handle:
        for row_number, row in enumerate(csv.DictReader(handle), start=1):
            issn = row['ISSN'].strip()
            # Titles may be broken over several lines by the PDF extraction.
            raw_title = row['Journal title']
            jname = raw_title.replace('\n', ' ').replace('  ', ' ').strip()
            oastatus = row['OA model'].strip()
            currency = row['Currency'].strip()
            price = row['Price'].strip()

            logger.info('-------({}) Working on {}; oastatus: {}; '
                        'issn: {}; price: {} {}'.format(
                            row_number, jname, oastatus, issn, currency,
                            price))

            journal = generic_parser.NodeParser(
                name=jname,
                publisher='Elsevier',
                source=price_list_source_id,
                publisher_node_id=publisher.node_id)
            journal.issn = issn
            journal.oa_status = oastatus.replace('Open Access',
                                                 'FULLY_OA').upper()

            # Look the journal up in Orpheus and act on the outcome.
            found_id, found_record, found_type = journal.match2node()
            resolved_journal = generic_parser.act_on_orpheus_match(
                journal, found_id, found_record, found_type)
            journal.node_id, journal.node_record = resolved_journal
            logger.debug('j_parser.node_record: {}'.format(
                journal.node_record))

            # Policies are attached to the preferred (PRIMARY) name; for any
            # other name status the record points at it via 'synonym_of'.
            preferred_node_id = (
                journal.node_id
                if journal.node_record['name_status'] == 'PRIMARY'
                else journal.node_record['synonym_of'])

            logger.debug('preferred_node_id: {}'.format(preferred_node_id))
            logger.debug('j_parser: {}'.format(vars(journal)))

            # Journal-specific part of the gold policy.
            gold_policy.node = preferred_node_id
            gold_policy.apc_currency = currency
            gold_policy.apc_value_min = price
            gold_policy.apc_value_max = price
            gold_policy.source = price_list_source_id

            logger.debug(
                'Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()'
            )
            journal.PolicyMatcher(journal, policy_type='oa_status').match()

            logger.debug(
                'Calling j_parser.PolicyMatcher(j_parser, policy_type=gold).match()'
            )
            gold_matcher = journal.PolicyMatcher(journal, policy_type='gold')
            gold_matcher.match(**gold_policy.as_dict())

コード例 #8

ファイルを表示

ファイル: romeo_parser.py プロジェクト: sejoska/orpheus

def main():
    """Import SHERPA/RoMEO publisher policies and their journals into Orpheus.

    Reads the offline RoMEO dataset, creates one publisher node per RoMEO
    record, attaches green policies derived from the pre/post/pdf archiving
    fields (grouping versions that share the same raw value), and finally
    matches every journal listed for that RoMEO id.
    """

    def archiving_permitted(record, field):
        """Map the RoMEO archiving value stored under *field* to a boolean.

        'can' and 'restricted' allow deposit; 'cannot', 'unclear' and
        'unknown' do not. Anything else is logged as an error and treated as
        not allowed.
        """
        status = record[field].lower()
        if status in ('can', 'restricted'):
            return True
        if status not in ('cannot', 'unclear', 'unknown'):
            logging.error('Value of romeo {} field unrecognised: {}'.format(
                field, record[field]))
        return False

    def submit_green_policy(node_parser, versions, permitted):
        """Match a green policy for *versions* against *node_parser*'s node.

        Reads ``source_id`` and ``verbatim_text`` from the enclosing scope, so
        both must be set before this is called.
        """
        matcher = node_parser.PolicyMatcher(node_parser, policy_type='green')
        matcher.match(
            outlet=[PUBMED_ID, INST_REPO_ID, WEBSITE_ID],
            version=versions,
            deposit_allowed=permitted,
            source=source_id,
            verbatim=verbatim_text,
            problematic=True,
            vetted=False,
        )

    base_dir = os.path.dirname(os.path.abspath(__file__))

    romeo_source = generic_parser.SourceParser(
        'SHERPA/RoMEO', url='http://www.sherpa.ac.uk/romeo/index.php')
    source_id = romeo_source.match_or_create_source()

    # The dataset is read from a local dump. To refresh it from the API use
    # romeo_client.parser('?all=yes&showfunder=none&ak=', save_dataset=offline_file)
    offline_file = os.path.join(base_dir, 'romeo_all_publishers.xml')
    romeo = romeo_client.parser(offline_dataset=offline_file)
    romeo.parse_response()
    romeo.convert_restrictions()

    # Entries of output_dict that are skipped (presumably bookkeeping data
    # rather than publisher records -- confirm against romeo_client).
    skipped_keys = (
        'outcome', 'apicontrol', 'romeo_id_list', 'romeo_issn_list',
        'romeo_publisher_list', 'journals_dicts',
    )
    restriction_labels = (
        ('prerestriction', 'Restriction on preprint deposit: '),
        ('postrestriction', 'Restriction on AAM deposit: '),
        ('pdfrestriction', 'Restriction on VoR deposit: '),
    )

    entries = enumerate(romeo.output_dict.items(), start=1)
    for publisher_counter, (romeo_id, record) in entries:
        logger.info('-------------- ({}) Working on romeo_id {}'.format(
            publisher_counter, romeo_id))
        if romeo_id in skipped_keys:
            continue

        # Prefer the alias when the name contains HTML character references.
        if ('&#' in record['name']) and record['alias']:
            logging.info(
                'romeo_parser: using {} instead of {} as name for romeo_id {}'
                .format(record['alias'], record['name'], romeo_id))
            name = record['alias']
        else:
            name = record['name']

        parser = generic_parser.NodeParser(name=name,
                                           romeo_id=romeo_id,
                                           source=source_id,
                                           type='PUBLISHER')
        node_id, _node_record = parser.create_node(force_creation=True)

        if node_id:
            # Human-readable summary of restrictions/conditions, stored as
            # the verbatim text of every policy created for this publisher.
            pieces = []
            for key, label in restriction_labels:
                if record[key]:
                    pieces.extend(
                        label + str(item) + '\n' for item in record[key])
            if record['condition']:
                pieces.append('Conditions:\n')
                pieces.extend(cond + '\n' for cond in record['condition'])
            verbatim_text = ''.join(pieces)

            allowed = archiving_permitted(record, 'prearchiving')

            # Versions whose raw archiving values coincide share one policy.
            if (record['prearchiving'] == record['postarchiving'] ==
                    record['pdfarchiving']):
                submit_green_policy(parser, [AM_ID, PREPRINT_ID, VOR_ID],
                                    allowed)
            elif record['prearchiving'] == record['postarchiving']:
                submit_green_policy(parser, [AM_ID, PREPRINT_ID], allowed)
                submit_green_policy(
                    parser, [VOR_ID],
                    archiving_permitted(record, 'pdfarchiving'))
            elif record['prearchiving'] == record['pdfarchiving']:
                submit_green_policy(parser, [PREPRINT_ID, VOR_ID], allowed)
                submit_green_policy(
                    parser, [AM_ID],
                    archiving_permitted(record, 'postarchiving'))
            elif record['pdfarchiving'] == record['postarchiving']:
                submit_green_policy(parser, [PREPRINT_ID], allowed)
                submit_green_policy(
                    parser, [AM_ID, VOR_ID],
                    archiving_permitted(record, 'postarchiving'))
            else:
                # All three values differ: one policy per version.
                for field, version_id in (('prearchiving', PREPRINT_ID),
                                          ('postarchiving', AM_ID),
                                          ('pdfarchiving', VOR_ID)):
                    submit_green_policy(parser, [version_id],
                                        archiving_permitted(record, field))
        else:
            logging.error('Failed to parse romeo id {}'.format(romeo_id))

        # Journals are processed whether or not the publisher node was created.
        journals = enumerate(get_journals4id(romeo_id), start=1)
        for journal_counter, j in journals:
            logger.debug('j: {}'.format(j))
            j_parser = generic_parser.NodeParser(name=j[0],
                                                 issn=j[1],
                                                 eissn=j[2],
                                                 publisher_node_id=node_id,
                                                 source=source_id)
            logger.info('------------({} {}) Working on journal {}'.format(
                name, journal_counter, j[0]))
            # Attempt to find a match in Orpheus and act on the outcome.
            j_node_id, j_node_record, match_type = j_parser.match2node()
            resolved = generic_parser.act_on_orpheus_match(
                j_parser, j_node_id, j_node_record, match_type)
            j_parser.node_id, j_parser.node_record = resolved

コード例 #9

ファイルを表示

ファイル: elsevier_embargos_parser.py プロジェクト: sejoska/orpheus

def main():
    """Load Elsevier per-journal embargo periods into Orpheus.

    Registers the dataset and the Elsevier sharing web page as sources,
    resolves the 'Elsevier' publisher node, prepares the green policy
    templates (preprint, two accepted-manuscript variants, version of record)
    and then, for every CSV row, matches or creates the journal node and
    attaches its OA status and green policies.
    """

    def make_green_policy(**attributes):
        """Return a GreenPolicyInstance with *attributes* set in the order given."""
        policy = generic_parser.GreenPolicyInstance()
        for attribute, value in attributes.items():
            setattr(policy, attribute, value)
        return policy

    base_dir = os.path.dirname(os.path.abspath(__file__))

    # --- sources ---------------------------------------------------------
    dataset_source = generic_parser.SourceParser(
        'Gray, A. 2018. Elsevier embargo periods, 2013-2018',
        url='https://doi.org/10.6084/m9.figshare.1554748.v14')
    dataset_source_id = dataset_source.match_or_create_source()

    website_source = generic_parser.SourceParser(
        'Elsevier website: Article Sharing',
        url='https://www.elsevier.com/about/policies/sharing')
    website_source_id = website_source.match_or_create_source()

    # --- publisher node --------------------------------------------------
    publisher = generic_parser.NodeParser(
        name='Elsevier', type='PUBLISHER', source=website_source_id)
    pub_id, pub_record, pub_match_type = publisher.match2node()
    resolved_publisher = generic_parser.act_on_orpheus_match(
        publisher, pub_id, pub_record, pub_match_type)
    publisher.node_id, publisher.node_record = resolved_publisher

    # --- policy templates (journal-specific fields are set per row) ------
    preprint_policy = make_green_policy(
        outlet=[INST_REPO_ID, SUBJ_REPO_ID, WEBSITE_ID, COMMERCIAL_ID,
                PUBMED_ID, SOCIAL_ID],
        version=[PREPRINT_ID],
        version_embargo_months=0,
        version_green_licence=CCBYNCND_ID,
        source=website_source_id,
        verbatim=preprint_verbatim,
    )

    # Accepted manuscript on personal websites: no embargo.
    am_policy1 = make_green_policy(
        outlet=[WEBSITE_ID],
        version=[AM_ID],
        version_embargo_months=0,
        version_green_licence=CCBYNCND_ID,
        source=website_source_id,
        verbatim=am_verbatim,
    )

    # Accepted manuscript on non-commercial hosting platforms: the embargo
    # comes from the dataset, so it is filled in for each journal below.
    am_policy2 = make_green_policy(
        outlet=[INST_REPO_ID, SUBJ_REPO_ID, PUBMED_ID],
        version=[AM_ID],
        version_green_licence=CCBYNCND_ID,
        source=dataset_source_id,
        verbatim=am_verbatim,
    )

    # Version of record: deposit not allowed anywhere.
    vor_policy = make_green_policy(
        outlet=ALL_OUTLETS,
        deposit_allowed=False,
        version=[VOR_ID],
        source=website_source_id,
        verbatim=vor_verbatim,
    )

    # Debug message paired with the policy it announces, in matching order.
    green_steps = (
        ('Calling j_parser.server_data_match_green_policy(**preprint_policy.as_dict())',
         preprint_policy),
        ('Calling j_parser.server_data_match_green_policy(**am_policy1.as_dict())',
         am_policy1),
        ('Calling j_parser.server_data_match_green_policy(**am_policy2.as_dict())',
         am_policy2),
    )

    csv_path = os.path.join(
        base_dir, 'datasets',
        'Elsevier_embargo_periods_by_journal_2013-2018_v_1.14_sheet_UK-2018.csv')

    with open(csv_path, encoding='utf-8') as handle:
        for row_number, row in enumerate(csv.DictReader(handle), start=1):
            issn = row['ISSN'].strip()
            jname = row['Journal Name'].strip()
            aam_embargo = row['Embargo Period (months)'].strip()
            # 0/12 (and 0/24) titles are now OA with no embargo; the longer
            # period only applies to papers from before the switch.
            if aam_embargo in ('0 / 12', '0 / 24'):
                aam_embargo = '0'
            # A zero embargo marks the journal as fully OA, otherwise hybrid.
            oastatus = 'FULLY_OA' if aam_embargo == '0' else 'HYBRID'

            logger.info('-------({}) Working on journal {}; oastatus: {}; issn: {}; '
                        'AM embargo: {}'.format(row_number, jname, oastatus, issn, aam_embargo))

            j_parser = generic_parser.NodeParser(
                name=jname,
                publisher='Elsevier',
                source=dataset_source_id,
                publisher_node_id=publisher.node_id)
            j_parser.issn = issn
            j_parser.oa_status = oastatus

            # Look the journal up in Orpheus and act on the outcome.
            found_id, found_record, found_type = j_parser.match2node()
            resolved_journal = generic_parser.act_on_orpheus_match(
                j_parser, found_id, found_record, found_type)
            j_parser.node_id, j_parser.node_record = resolved_journal

            logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))

            # Policies go on the preferred (PRIMARY) name; any other name
            # status points at it through 'synonym_of'.
            preferred_node_id = (
                j_parser.node_id
                if j_parser.node_record['name_status'] == 'PRIMARY'
                else j_parser.node_record['synonym_of'])

            # Journal-specific fields of the green policy templates.
            for policy in (preprint_policy, am_policy1, am_policy2):
                policy.node = preferred_node_id
            am_policy2.version_embargo_months = aam_embargo

            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()')
            j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()

            for message, policy in green_steps:
                logger.debug(message)
                green_matcher = j_parser.PolicyMatcher(j_parser, policy_type='green')
                green_matcher.match(**policy.as_dict())

            # The version-of-record ban is only recorded for non fully-OA journals.
            if j_parser.oa_status != 'FULLY_OA':
                vor_policy.node = preferred_node_id
                logger.debug('Calling j_parser.server_data_match_green_policy(**vor_policy.as_dict())')
                vor_matcher = j_parser.PolicyMatcher(j_parser, policy_type='green')
                vor_matcher.match(**vor_policy.as_dict())

コード例 #10

ファイルを表示

ファイル: cup_parser.py プロジェクト: sejoska/orpheus

def main() -> None:
    """Load the Cambridge University Press APC price list into Orpheus.

    For every journal row of the CSV this:

    * matches or creates the journal node under the CUP publisher node,
    * records its OA status,
    * attaches a preprint green policy plus the green policies returned by
      ``optimal_green_policies`` for the row's embargo combination,
    * attaches a gold policy when APC values or licence options were parsed.

    As a by-product, every distinct combination of green embargo values seen
    in the dataset is written to ``cup_green_combinations.csv`` in the
    current working directory.

    Raises:
        KeyError: if a row contains an OA status, embargo wording or licence
            name that is missing from the lookup tables below.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser('CUP APC Price List 2019.04 24.i.2019',
                                                url=None)
    SOURCE_ID = source_parser.match_or_create_source()

    cup_website_parser = generic_parser.SourceParser(
        'CUP website: Green Open Access Policy for Journals',
        url='https://www.cambridge.org/core/services/open-access-policies/open-access-journals/'
            'green-open-access-policy-for-journals')
    CUP_WEBSITE_ID = cup_website_parser.match_or_create_source()

    p_parser = generic_parser.NodeParser(name='Cambridge University Press', type='PUBLISHER',
                                         source=SOURCE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)

    inputfile = os.path.join(BASE_DIR, 'datasets', 'Cambridge-Journals-APC-price-list-2019.04.csv')

    # CUP wording -> Orpheus OA status.
    cup2orpheus_status = {
        '': 'SUBSCRIPTION',
        'Hybrid OA': 'HYBRID',
        'No OA': 'SUBSCRIPTION',
        'Full OA': 'FULLY_OA',
    }

    # CUP wording -> embargo in months; 999 is a sentinel for "deposit disallowed".
    cup_embargo2months = {
        'On acceptance': 0,
        'On Acceptance': 0,
        'On acceptance (SSRN deposit permitted)': 0,
        'On publication': 0,
        "Publisher's version pdf, no sooner than first publication of the article": 0,
        '5 months after publication': 5,
        '6 months after publication': 6,
        '6months after publication': 6,
        "Publisher's version pdf, no sooner than six months after first publication of the article": 6,
        '12 months after acceptance': 12,
        '12 months after publication': 12,
        '13 months after publication': 13,
        'Abstract only plus link to Cambridge site': 999,  # DISALLOWED
        'Abstract only in PDF or HTML, no sooner than publication of full article': 999,  # DISALLOWED
    }

    cup_licences2orpheus_ids = {
        'CC-BY': CCBY_ID,
        'CC-BY-NC': CCBYNC_ID,
        'CC-BY-NC-SA': CCBYNCSA_ID,
        'CC-BY-NC-ND': CCBYNCND_ID,
    }

    # Raw strings: '\$' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tpounds = re.compile(r'£[0-9,]+')
    tdollars = re.compile(r'\$[0-9,]+')

    # (label, CSV column) for each outlet/version pair, in the order in which
    # the embargo values are handed to optimal_green_policies() and written
    # to the combinations spreadsheet. Outlets: webpage = personal website,
    # inst_repo = institutional repository, pmc = subject repository/PubMed,
    # social = commercial repository / social media site.
    green_columns = [
        ('webpage_AAM', "Author's personal web page Accepted Manuscript"),
        ('webpage_VoR', "Author's personal web page Version of Record"),
        ('inst_repo_AAM', 'Departmental web page / Institutional Repository Accepted Manuscript'),
        ('inst_repo_VoR', 'Departmental web page / Institutional Repository Version of Record'),
        ('pmc_AAM', 'Non-commercial Repository / Subject Repository Accepted Manuscript'),
        ('pmc_VoR', 'Non-commercial Repository / Subject Repository Version of Record'),
        ('social_AAM', 'Commercial Repository / Social Media Site Accepted Manuscript'),
        ('social_VoR', 'Commercial Repository / Social Media Site Version of Record'),
    ]

    green_combinations = []
    # NOTE(review): never populated, so the pprint() at the end always prints
    # an empty list; kept to leave the script's output unchanged.
    embargo_strings = []

    with open(inputfile, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row_counter, row in enumerate(reader, start=1):
            jname = row['Journal'].replace('  ', ' ').strip()
            logger.info('-------({}) Working on journal {}'.format(row_counter, jname))
            jurl = row['URL'].strip()
            issn = row['ISSN'].strip()
            eissn = row['eISSN'].strip()
            oastatus = cup2orpheus_status[row['Open Access status'].strip()]
            apc_data = row['Gold OA APC (plus tax, where applied)'].strip()
            mpounds = tpounds.findall(apc_data)
            mdollars = tdollars.findall(apc_data)
            # The trailing space in the column name is part of the CSV header.
            licence_data = row['Gold OA CC licence options '].strip().split(' / ')

            # Embargo (in months) for each outlet/version pair of this journal.
            green_comb = [cup_embargo2months[row[column].strip()]
                          for _label, column in green_columns]

            # Collect all green policy combinations that appear in the dataset.
            if green_comb not in green_combinations:
                green_combinations.append(green_comb)

            # Add a `continue` here to only produce the spreadsheet of green
            # policy combinations without touching Orpheus.

            policies_array = optimal_green_policies(green_comb)

            # region apc data parsing
            # Prices in pounds take precedence; dollars are only used when no
            # pound amount is present.
            apc_currency = None
            apc_value_min = None
            apc_value_max = None
            if mpounds:
                apc_currency = 'GBP'
                apc_list = [int(apc.replace(',', '').replace('£', '').strip())
                            for apc in mpounds]
            elif mdollars:
                apc_currency = 'USD'
                apc_list = [int(apc.replace(',', '').replace('$', '').strip())
                            for apc in mdollars]
            else:
                apc_list = []
            apc_list.sort()
            if apc_list:
                apc_value_min = apc_list[0]
                apc_value_max = apc_list[-1]
            # endregion

            # Empty entries (e.g. from a blank cell) are ignored.
            licence_options = [cup_licences2orpheus_ids[licence]
                               for licence in licence_data if licence]

            j_parser = generic_parser.NodeParser(name=jname, publisher='Cambridge University Press',
                                                 source=SOURCE_ID, issn=issn, eissn=eissn, url=jurl,
                                                 publisher_node_id=p_parser.node_id)
            j_parser.oa_status = oastatus

            # Attempt to find a match in Orpheus
            node_id, node_record, match_type = j_parser.match2node()
            j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                j_parser, node_id, node_record, match_type)

            logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))
            # Determine the Orpheus id of the preferred name: non-PRIMARY
            # names point at it through 'synonym_of'.
            if j_parser.node_record['name_status'] not in ['PRIMARY']:
                preferred_node_id = j_parser.node_record['synonym_of']
            else:
                preferred_node_id = j_parser.node_id

            # parsing OA status
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()')
            j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()

            # parsing green policies
            preprint = generic_parser.GreenPolicyInstance()
            preprint.outlet = ALL_OUTLETS
            preprint.version = [PREPRINT_ID]
            preprint.version_embargo_months = 0
            preprint.version_green_licence = CUSTOM_ID
            preprint.source = CUP_WEBSITE_ID
            preprint.verbatim = CUP_GREEN_VERBATIM
            preprint.node = preferred_node_id

            logger.debug('Calling j_parser.server_data_match_green_policy(**preprint.as_dict())')
            j_parser.PolicyMatcher(j_parser, policy_type='green').match(**preprint.as_dict())

            # Each entry is indexed as [embargo months, outlets, versions].
            for gp in policies_array:
                green = generic_parser.GreenPolicyInstance()

                green.outlet = gp[1]
                green.version = gp[2]
                if gp[0] == 999:  # deposit disallowed
                    green.deposit_allowed = False
                else:
                    green.version_embargo_months = gp[0]
                    green.version_green_licence = CUSTOM_ID
                    green.verbatim = CUP_GREEN_VERBATIM
                green.source = SOURCE_ID
                green.node = preferred_node_id

                logger.debug('Calling j_parser.server_data_match_green_policy(**green.as_dict())')
                j_parser.PolicyMatcher(j_parser, policy_type='green').match(**green.as_dict())

            # parsing gold policy
            # Test the parsed licence_options, not licence_data: str.split()
            # never returns an empty list (''.split(' / ') == ['']), so the
            # raw list is always truthy and would make this guard a no-op.
            if apc_list or licence_options:
                gold = generic_parser.GoldPolicyInstance()

                gold.apc_currency = apc_currency
                gold.apc_value_min = apc_value_min
                gold.apc_value_max = apc_value_max
                gold.source = SOURCE_ID
                gold.licence_options = licence_options
                gold.apc_note = apc_data
                gold.node = preferred_node_id
                logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=gold).match(**gold.as_dict())')
                j_parser.PolicyMatcher(j_parser, policy_type='gold').match(**gold.as_dict())

    pprint(embargo_strings)

    # newline='' is required by the csv module for files handed to a writer;
    # without it extra blank rows appear on platforms that translate '\n'.
    with open('cup_green_combinations.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([label for label, _column in green_columns])
        writer.writerows(green_combinations)