Example #1
def get_content(suffix, print_data=False):
    """
    From the page ( 'ecolex.org' + suffix ) we grab the relevant data, i.e. (type, document type, name, reference number,
    date, source name and source link, status, subject, keywords, treaty name and link, meeting name and link, website, abstract,
    ...).
    The data is then saved into a dictionary with parameter names as keys and the grabbed results as values.

    Example:

    data["category"] = "Treaty decision"
    data["name"] = "Decision XXIX_21 _ Membership of the Implementation Committee"

    In the end the dictionary is saved into a json file named (data["name"] with forbidden characters removed and
    length limited to 150 characters).json

    :suffix:        the suffix of the url from which we are extracting the data. The suffix string is everything that comes
                    after 'ecolex.org'.
    :print_data:    Optional parameter, False by default. If set to True, the function will at the end also print
                    what it managed to extract from the page.

    returns None
    """

    data = dict()

    # We request the page. If the request was successful, we take the content of the page and save it into page_text.
    get_page = requests.get(base_link + suffix)
    if get_page.status_code != 200:
        print('Request Denied!', suffix)
    page_text = get_page.text

    #: All the relevant data about the document is saved within <article> tags.
    #: With BeautifulSoup it is simple to navigate to that part of the html file.
    soup = BeautifulSoup(page_text, 'html.parser')
    important_text = str(soup.find('article'))

    #: Below are all the parameters and regex patterns that a document might have.

    string_parameters = {
        'documentType': r'Document type.*?dd>(.*?)<',
        'fieldOfApplication': r'Field of application.*?dd>(.*?)<',
        'date': r'Date.*?dd>(.*)<',
        'sourceLink': r'Source.*\s*.*\s*.*?href="(.*?)"',
        'sourceName': r'Source.*\s*.*\s*.*?.*?>(.*?)<',
        'sourceID': r'Source.*\s*.*\s*.*?ID:(.*)\)',
        'title': r'Title.*\s*.*\s*.*?>(.*?)<',
        'placeOfAdoption': r'Place of adoption.*\s*<dd>(.*?)<',
        'depository': r'Depository.*\s*<dd>(.*)<',
        'entryIntoForce': r'Entry into force.*\s*<dd>(.*?)<',
        'subject': r'Subject.*\s*<dd>(.*?)<',
        'geographicalArea': r'Geographical area.*\s*<dd>(.*?)<',
        'abstract': r'p class="abstract">(.*)<\/p>',
        'fullTextLink': r'Full text.*\s*.*\s*<a href="(.*?)"',
        'websiteLink': r'Website.*\s*.*\s*<a href="(.*?)"',
        'website': r'Website.*\s*.*\s*.*\s*.*?>(.*)<',
    }

    list_parameters = {
        'language': r'anguage.*\s*<dd>(.*?)<',
    }

    for parameter_name, regex_pattern in string_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_value_or_none(re_pat, important_text)

    for parameter_name, regex_pattern in list_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_list_or_none(re_pat, important_text)

    data['category'] = 'treaty'

    #: NAME, type : string

    re_name = re.compile(r'<h1>(.*?)<')
    data['name'] = get_value_or_none(re_name, important_text)
    if data['name'] is not None:
        data['name'] = remove_forbidden_characters(data['name'])
    else:
        print('Name of the file not found!', suffix)

    #: KEYWORDS, list of strings

    re_keywords = re.compile(r'span class="tag">(.*)<')
    data['keywords'] = re.findall(re_keywords, important_text)

    #: COUNTRY, ENTRY INTO FORCE, RATIFICATION, SIMPLE SIGNATURE. Will be saved as a list of dictionaries.

    participants = soup.find('article').find('section', {'id': 'participants'})

    data['participants'] = None

    if participants is not None:
        table = participants.find('table', {
            'class': 'participants'
        }).find('tbody')

        data['participants'] = []

        for column in table.find_all('tr'):

            country_pattern = {
                'country': r'th>(.*)<',
                'entryIntoForceDate': r'Entry into force date">\s*(.*)',
                'ratificationDate': r'Ratification date".*\s*(.*)',
                'simpleSignatureDate': r'Simple signature date">\s*(.*)',
            }

            column_data = dict()

            for parameter_name, regex_pattern in country_pattern.items():
                re_pat = re.compile(regex_pattern)
                column_data[parameter_name] = get_value_or_none(
                    re_pat, str(column))

            data['participants'].append(column_data)

    ########################################################################
    ########################################################################

    if print_data:
        for key in data:
            print(key + ' : ' + str(data[key]))

    with open('treaty\\' + data['name'][:150] + '.json', 'w') as outfile:
        json.dump(data, outfile)
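
Each of these examples relies on a handful of shared names that are not shown here: base_link, get_value_or_none, get_list_or_none and remove_forbidden_characters, plus the requests, re, json and BeautifulSoup imports. Below is only a minimal sketch of what those helpers might look like, assuming the regex helpers return the first capture group (or a comma-split list of it) and None when nothing matches; the real module may define them differently.

import json
import re

import requests
from bs4 import BeautifulSoup

#: Assumed base URL; every suffix passed to get_content is appended to it.
base_link = 'https://www.ecolex.org'


def get_value_or_none(regex_pattern, text):
    """Return the first capture group of regex_pattern in text, or None if there is no match."""
    match = regex_pattern.search(text)
    return match.group(1).strip() if match else None


def get_list_or_none(regex_pattern, text):
    """Return the first capture group split on commas into a list, or None if there is no match."""
    match = regex_pattern.search(text)
    if match is None:
        return None
    return [part.strip() for part in match.group(1).split(',')]


def remove_forbidden_characters(name):
    """Drop characters that are not allowed in file names on common file systems."""
    return re.sub(r'[<>:"/\\|?*]', '', name)
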
Example #2
def get_content(suffix, print_data=False):
    """
    From the page ( 'ecolex.org' + suffix ) we grab the relevant data, i.e. (type, document type, name, reference number,
    date, source name and source link, status, subject, keywords, treaty name and link, meeting name and link, website, abstract,
    ...).
    The data is then saved into a dictionary with parameter names as keys and the grabbed results as values.

    Example:

    data["category"] = "Treaty decision"
    data["name"] = "Decision XXIX_21 _ Membership of the Implementation Committee"

    In the end the dictionary is saved into a json file named (data["name"] with forbidden characters removed and
    length limited to 150 characters).json

    :suffix:        the suffix of the url from which we are extracting the data. The suffix string is everything that comes
                    after 'ecolex.org'.
    :print_data:    Optional parameter, False by default. If set to True, the function will at the end also print
                    what it managed to extract from the page.

    returns None
    """

    data = dict()

    # We request the page. If the request was successful, we take the content of the page and save it into page_text.
    get_page = requests.get(base_link + suffix)
    if get_page.status_code != 200:
        print('Request Denied!', suffix)
    page_text = get_page.text

    #: All the relevant data about the document is saved within <article> tags.
    #: With BeautifulSoup it is simple to navigate to that part of the html file.
    soup = BeautifulSoup(page_text, 'html.parser')
    important_text = str(soup.find('article'))

    #: Below are all the parameters and regex patterns that a document might have.

    string_parameters = {
        'country/Territory' : r'Country\/Territory.*\s.*<dd>(.*?)<',
        'typeOfCourt' : r'Type of court.*\s*<dd>(.*?)<',
        'date' : r'Date.*?<dd>(.*?)<',
        'sourceName' : r'Source.*\s*<dd>(.*?),',
        'sourceLink' : r'Source.*\s*.*href="(.*)"',
        'courtName' : r'Court name.*\s*<dd>(.*)<',
        'seatOfCourt' : r'Seat of court.*\s*<dd>(.*)<',
        'referenceNumber' : r'Reference number.*?<dd>(.*)<',
        'language' : r'Language.*\s*<dd>(.*)<',
        'subject' : r'Subject.*\s*<dd>(.*?)<',
        'abstract' : r'Abstract<\/dt>\s*(.*)',
        'fullTextLink' : r'Full text.*\s*.*?href="(.*?)"',
    }

    list_parameters = {
    }

    for parameter_name, regex_pattern in string_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_value_or_none(re_pat, page_text)

    for parameter_name, regex_pattern in list_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_list_or_none(re_pat, page_text)


    data['category'] = 'jurisprudence'

    #: NAME, type : string

    re_name = re.compile(r'<h1>(.*?)<')
    data['name'] = get_value_or_none(re_name, important_text)
    if data['name'] is not None:
        data['name'] = remove_forbidden_characters(data['name'])
    else:
        print('Name of the file not found!', suffix)

    #: JUDGE, list of judges

    re_judge = re.compile(r'Judge.*?<dd>(.*?)<')
    result = get_value_or_none(re_judge, important_text)
    if result is not None:
        data['judge'] = result.split(';')

    re_keyword = re.compile(r'span class="tag">(.*?)<')
    data['keywords'] = re.findall(re_keyword, important_text)

    ########################################################################
    ########################################################################

    if print_data:
        for key in data:
            print(key + ' : ' + str(data[key]))
    
    with open('jurisprudence\\' + data['name'][:150] + '.json', 'w') as outfile:
        json.dump(data, outfile)
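
To make the regex-over-html approach concrete, here is a tiny, self-contained check using one of the patterns from the table above together with the get_value_or_none sketch from earlier; the html snippet is made up for illustration.

snippet = '<dt>Type of court</dt>\n<dd>Supreme Court</dd>'
re_court = re.compile(r'Type of court.*\s*<dd>(.*?)<')
print(get_value_or_none(re_court, snippet))  # prints: Supreme Court
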
Example #3
def get_content(suffix, print_data=False):
    """
    From the page ( 'ecolex.org' + suffix ) we grab the relevant data, i.e. (type, document type, name, reference number,
    date, source name and source link, status, subject, keywords, treaty name and link, meeting name and link, website, abstract,
    ...).
    The data is then saved into a dictionary with parameter names as keys and the grabbed results as values.

    Example:

    data["category"] = "Treaty decision"
    data["name"] = "Decision XXIX_21 _ Membership of the Implementation Committee"

    In the end the dictionary is saved into a json file named (data["name"] with forbidden characters removed and
    length limited to 150 characters).json

    :suffix:        the suffix of the url from which we are extracting the data. The suffix string is everything that comes
                    after 'ecolex.org'.
    :print_data:    Optional parameter, False by default. If set to True, the function will at the end also print
                    what it managed to extract from the page.

    returns None
    """

    data = dict()

    # We request the page. If the request was successful, we take the content of the page and save it into page_text.
    get_page = requests.get(base_link + suffix)
    if get_page.status_code != 200:
        print('Request Denied!', suffix)
    page_text = get_page.text

    #: Below are all the parameters and regex patterns that a document might have. Since the patterns can vary drastically,
    #: it was easier to handle every parameter one by one.

    string_parameters = {
        'category': r'record-icon">\s*<.*?title="(.*?)"',
        'documentType': r'Document type<\/dt>\s?<dd>(.*?)<',
        'referenceNumber': r'Reference number<\/dt>\s?<dd>(.*?)<',
        'date': r'title="Date">(.*?)<',
        'sourceName': r'Source<\/dt>\s*<dd>\s*(.*?),',
        'sourceLink': r'Source<\/dt>\s*<dd>\s*.*\s*.*?href="(.*?)"',
        'status': r'Status<\/dt>\s?<dd>(.*?)<',
        'treatyName': r'Treaty<\/dt>\s*<dd>\s*.*?>\s*(.*)',
        'meetingName': r'Meeting<\/dt>\s*<dd>\s*.*\s*.*?>(.*?)<',
        'meetingLink': r'Meeting<\/dt>\s*<dd>\s*<a href="(.*?)"',
        'website': r'Website<\/dt>\s*<dd>\s*<a href="(.*?)"',
        'fullTextLink': r'Full text<\/dt>\s*<dd>\s*<a href="(.*?)"',
        'entryIntoForceNotes':
        r'Entry into force notes<\/dt>\s*<dd>(.*?)<\/dd',
    }

    list_parameters = {
        'subject': r'Subject<\/dt>\s*<dd>(.*?)<',
        'country/Territory': r'Country\/Territory<\/dt>\s*<dd>(.*?)<',
        'geographicalArea': r'Geographical area<\/dt>\s*<dd>(.*?)<',
    }

    for parameter_name, regex_pattern in string_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_value_or_none(re_pat, page_text)

    for parameter_name, regex_pattern in list_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_list_or_none(re_pat, page_text)

    # The parameters below are special and are handled separately:

    ###: NAME, type : string

    re_name = re.compile(r'<h1>(.*?)<')
    data['name'] = get_value_or_none(re_name, page_text)
    if data['name'] is not None:
        data['name'] = remove_forbidden_characters(data['name'])
    else:
        print('Name of the file not found!', suffix)

    ###: KEYWORD, type : list of strings

    re_keyword = re.compile(r'span class="tag">(.*?)<')
    data['keyword'] = re.findall(re_keyword, page_text)

    ###: TREATY - LINK, type : string

    re_treatyLink = re.compile(r'Treaty<\/dt>\s*<dd>\s*<a href="(.*?)"')
    data['treatyLink'] = get_value_or_none(re_treatyLink, page_text)
    if data['treatyLink'] is not None:
        data['treatyLink'] = base_link + data['treatyLink']

    ###: ABSTRACT, type : string
    #: In the current implementation all html tags are removed from the text. It might make sense to keep the paragraph tags.

    re_abstract = re.compile(r'Abstract<\/dt>\s*<dd>(.*?)<\/dd')
    abstract_text = get_value_or_none(re_abstract, page_text)

    if abstract_text is not None:

        all_tags = re.compile(r'<.*?>')
        cleaned_text = re.sub(all_tags, '', abstract_text)

        data['abstract'] = cleaned_text
    else:
        data['abstract'] = None

    ########################################################################################
    ########################################################################################
    """
    Below we extract the data about the document's references. Since the html structure is much more complex here, this
    is easier to do with the BeautifulSoup library.

    - All the data about the document is written inside <article> tag.
    - All the data about references is written inside <section, id='legislation-references'> tag.
    - References are grouped by their type (Amends, Implements, Implemented by, ...). Every group is saved inside <dl> tag.
    - Inside every <dl> tag, we can find the type of the group in <dt> tag, and then in every <dd> tag that follows, we can grab
      the data of each reference.

    Here we use the BeautifulSoup library since its tools let us navigate through the html tags and structure easily.

    """

    soup = BeautifulSoup(page_text, 'html.parser')
    ref_section = soup.find('article').find('section',
                                            {'id': 'legislation-references'})
    if ref_section is not None:
        ref_section = ref_section.find_all('dl')

    data['references'] = dict()

    if ref_section is not None:

        for type_reference in ref_section:
            tip = type_reference.dt.text
            data['references'][tip] = []

            for each_reference in type_reference.find_all('dd'):

                reftekst = str(each_reference)

                single_reference = dict()

                ref_string_parameters = {
                    'refLink': r'title">\s*<.*?="(.*?)"',
                    'refName': r'search-result-title">\s*.*\s*.*?>(.*?)<',
                    'refCountry': r'title="Country\/Territory">(.*)<',
                    'refDate': r'title="Date">(.*)',
                    'refSourceLink': r'Source.*\s*.*? href="(.*?)"',
                    'refSourceName': r'Source.*\s*.*?>(.*?)<',
                }

                for parameter_name, regex_pattern in ref_string_parameters.items():
                    re_pat = re.compile(regex_pattern)
                    single_reference[parameter_name] = get_value_or_none(
                        re_pat, reftekst)

                re_refKeywords = re.compile(r'keywords">(.*?)<')
                single_reference['refKeywords'] = get_list_or_none(
                    re_refKeywords, reftekst)

                data['references'][tip].append(single_reference)

    ########################################################################
    ########################################################################

    if print_data:
        for key in data:
            print(key + ' : ' + str(data[key]))

    with open('legislation\\' + data['name'][:150] + '.json', 'w') as outfile:
        json.dump(data, outfile)
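
For orientation, the reference extraction above yields a nested structure roughly like the following; the keys come from the code, while the values here are purely illustrative.

example_references = {
    'Amends': [
        {
            'refLink': '/details/legislation/some-amended-act/',  # hypothetical link
            'refName': 'Some Amended Act',                        # hypothetical title
            'refCountry': 'Slovenia',
            'refDate': '2001',
            'refSourceLink': None,   # None whenever a pattern finds no match
            'refSourceName': None,
            'refKeywords': ['water quality', 'pollution control'],
        },
    ],
}
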
Example #4
def get_content(suffix, print_data=False):
    """
    From the page ( 'ecolex.org' + suffix ) we grab the relevant data, i.e. (type, document type, name, reference number,
    date, source name and source link, status, subject, keywords, treaty name and link, meeting name and link, website, abstract,
    ...).
    The data is then saved into a dictionary with parameter names as keys and the grabbed results as values.

    Example:

    data["category"] = "Treaty decision"
    data["name"] = "Decision XXIX_21 _ Membership of the Implementation Committee"

    In the end the dictionary is saved into a json file named (data["name"] with forbidden characters removed and
    length limited to 150 characters).json

    :suffix:        the suffix of the url from which we are extracting the data. The suffix string is everything that comes
                    after 'ecolex.org'.
    :print_data:    Optional parameter, False by default. If set to True, the function will at the end also print
                    what it managed to extract from the page.

    returns None
    """

    data = dict()

    get_page = requests.get(base_link + suffix)
    if get_page.status_code != 200:
        print('Request Denied!', suffix)
        #: in case request is denied, we can't do anything
    page_text = get_page.text

    #: Below are all the parameters and regex patterns that a document might have. Since the patterns can vary drastically,
    #: it was easier to handle every parameter one by one.

    string_parameters = {
        'category': r'record-icon">\s*<.*?title="(.*?)"',
        'documentType': r'Document type<\/dt>\s?<dd>(.*?)<',
        'referenceNumber': r'Reference number<\/dt>\s?<dd>(.*?)<',
        'date': r'Date<\/dt>\s?<dd>(.*?)<',
        'sourceName': r'Source<\/dt>\s?<dd>(.*?),',
        'sourceLink': r'Source<\/dt>\s?<dd>.*?href="(.*?)"',
        'status': r'Status<\/dt>\s?<dd>(.*?)<',
        'treatyName': r'Treaty<\/dt>\s*<dd>\s*.*?>\s*(.*)',
        'meetingName': r'Meeting<\/dt>\s*<dd>\s*.*\s*.*?>(.*?)<',
        'meetingLink': r'Meeting<\/dt>\s*<dd>\s*<a href="(.*?)"',
        'website': r'Website<\/dt>\s*<dd>\s*<a href="(.*?)"',
        'fullTextLink': r'Full text<\/dt>\s*<dd>\s*<a href="(.*?)"',
    }

    list_parameters = {
        'subject': r'Subject<\/dt>\s*<dd>(.*?)<',
        'country/Territory': r'Country\/Territory<\/dt>\s*<dd>(.*?)<',
        'geographicalArea': r'Geographical area<\/dt>\s*<dd>(.*?)<',
    }

    for parameter_name, regex_pattern in string_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_value_or_none(re_pat, page_text)

    for parameter_name, regex_pattern in list_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_list_or_none(re_pat, page_text)

    # The parameters below are special and are handled separately:

    #: NAME, type : string

    re_name = re.compile(r'<h1>(.*?)<')
    data['name'] = get_value_or_none(re_name, page_text)
    if data['name'] is not None:
        data['name'] = remove_forbidden_characters(data['name'])
    else:
        print('Name of the file not found!', suffix)

    #: KEYWORD, type : list of strings

    re_keyword = re.compile(r'span class="tag">(.*?)<')
    data['keyword'] = re.findall(re_keyword, page_text)

    #: TREATY - LINK, type : string

    re_treatyLink = re.compile(r'Treaty<\/dt>\s*<dd>\s*<a href="(.*?)"')
    data['treatyLink'] = get_value_or_none(re_treatyLink, page_text)
    if data['treatyLink'] is not None:
        data['treatyLink'] = base_link + data['treatyLink']

    #: ABSTRACT, type : string
    #: In the current implementation all html tags are removed from the text. It might make sense to keep the <p> paragraph tags.

    re_abstract = re.compile(r'Abstract<\/dt>\s*<dd>\s*<div.*?>(.*?)<\/div>')
    abstract_text = get_value_or_none(re_abstract, page_text)

    if abstract_text is not None:

        all_tags = re.compile(r'<.*?>')
        cleaned_text = re.sub(all_tags, '', abstract_text)

        data['abstract'] = cleaned_text
    else:
        data['abstract'] = None

    ########################################################################
    ########################################################################

    if print_data:
        for key in data:
            print(key + ' : ' + str(data[key]))

    with open('treaty decisions\\' + data['name'][:150] + '.json',
              'w') as outfile:
        json.dump(data, outfile)
Example #5
def get_content(suffix, print_data=False):
    """
    From the page ( 'ecolex.org' + suffix ) we grab the relevant data, i.e. (type, document type, name, reference number,
    date, source name and source link, status, subject, keywords, treaty name and link, meeting name and link, website, abstract,
    ...).
    The data is then saved into a dictionary with parameter names as keys and the grabbed results as values.

    Example:

    data["category"] = "Treaty decision"
    data["name"] = "Decision XXIX_21 _ Membership of the Implementation Committee"

    In the end the dictionary is saved into a json file named (data["name"] with forbidden characters removed and
    length limited to 150 characters).json

    :suffix:        the suffix of the url from which we are extracting the data. The suffix string is everything that comes
                    after 'ecolex.org'.
    :print_data:    Optional parameter, False by default. If set to True, the function will at the end also print
                    what it managed to extract from the page.

    returns None
    """

    data = dict()

    # We request the page. If the request was successful, we take the content of the page and save it into page_text.
    get_page = requests.get(base_link + suffix)
    if get_page.status_code != 200:
        print('Request Denied!', suffix)
    page_text = get_page.text

    #: All the relevant data about the document is saved within <article> tags.
    #: With BeautifulSoup it is simple to navigate to that part of the html file.
    soup = BeautifulSoup(page_text, 'html.parser')
    important_text = str(soup.find('article'))

    #: Below are all the parameters and regex patterns that a document might have.

    string_parameters = {
        'date' : r'<dt>Date.*\s*<dd>(.*?)<',
        'sourceLink' : r'Source.*\s*.*\s*.*?href="(.*?)"',
        'sourceName' : r'Source.*\s*.*\s*.*?>(.*?)<',
        'sourceID' : r'\(ID:.*?>(.*?)<',
        'publisher' : r'Publisher.*\s*.*\s*(.*)',
        'placePublication' : r'Place of publication.*\s*.*\s*.*\s*\|(.*)',
        'ISBN' : r'ISBN.*\s*<dd>(.*?)<',
        'ISSN' : r'ISSN.*\s*<dd>(.*?)<',
        'pages' : r'Pages.*\s*<dd>(\d*)',
        'documentType' : r'Document type.*\s*<dd>(.*?)<',
        'fullTextLink' : r'Full text.*\s*.*\s*.*?href="(.*?)"',
        'website' : r'Website.*\s*.*\s*<a href="(.*?)"',
        'basin' : r'Basin.*\s*<dd>(.*?)<',
        'fieldOfApplication' : r'Field of application.*\s*<dd>(.*?)<',
        'DOI' : r'DOI.*\s*.*\s*<a href="(.*?)"',
        'journal/series' : r'Journal\/Series.*\s*<dd>\s*(.*\s*\|.*)',

    }

    list_parameters = {
        'author' : r'uthor.*\s*<dd>(.*?)<',
        'language' : r'Language.*\s*<dd>(.*?)<',
        'country/Territory' : r'Country\/Territory.*\s*<dd>(.*?)<',
        'subject' : r'Subject.*\s*<dd>(.*?)<',
        'geographicalArea' : r'Geographical area.*\s*<dd>(.*?)<',

    }

    for parameter_name, regex_pattern in string_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_value_or_none(re_pat, important_text)

    for parameter_name, regex_pattern in list_parameters.items():
        re_pat = re.compile(regex_pattern)
        data[parameter_name] = get_list_or_none(re_pat, important_text)


    data['category'] = 'literature'

    #: NAME, type : string

    re_name = re.compile(r'<h1>(.*?)<')
    data['name'] = get_value_or_none(re_name, important_text)
    if data['name'] is not None:
        data['name'] = remove_forbidden_characters(data['name'])
    else:
        print('Name of the file not found!', suffix)

    #: KEYWORDS : list of strings
    #: Because of the html structure around keywords, we are able to extract all keywords with just re.findall(...)

    re_keyword = re.compile(r'span class="tag">(.*?)<')
    data['keyword'] = re.findall(re_keyword, important_text)

    #: ABSTRACT, type : string

    re_abstract = re.compile(r'class="abstract">(.*)')
    data['abstract'] = get_value_or_none(re_abstract, important_text)

    #: We have two types of references: one is literature references, the other is
    #: 'other references'. In each of them, the data for a single reference is saved inside a <dl> tag.
    #: With BeautifulSoup we navigate into each of these <dl> tags and extract the data of that reference.
    #: The data is saved into our dictionary as follows:
    #: - data['other_references'] = list()
    #: - for each reference we append to that list a dictionary for that reference
    #: - the dictionary has the same structure as above

    ref_section = soup.find('article').find('section', {'id' : 'other-references'})

    if ref_section is not None:

        data['other_references'] = list()
        
        other_refs = ref_section.find_all('dl')
        for each_reference in other_refs:

            reftext = str(each_reference)

            single_reference = dict()

            ref_string_parameters = {
                'refType' : r'<dt>(.*?)<',
                'refLink' : r'result-title.*\s*.*?href="(.*)"',
                'refName' : r'result-title.*\s*.*\s*title="(.*)"',
                'refDocumentType' : r'Document type">(.*?)<',
                'refPlaceOfAdoption' : r'Place of adoption">(.*?)<',
                'refDate' : r'Date:(.*?)"',
                'refSourceID' : r'source.*\s*.*?ID:(.*?)<',
                'refSourceLink' : r'source.*\s*.*?href="(.*?)"',
                'refSourceName' : r'source.*\s*.*?href.*?>(.*?)<',
            }

            ref_list_parameters = {
                'refKeywords' : r'keywords">(.*?)<',
            }

            for parameter_name, regex_pattern in ref_string_parameters.items():
                re_pat = re.compile(regex_pattern)
                single_reference[parameter_name] = get_value_or_none(re_pat, reftext)

            for parameter_name, regex_pattern in ref_list_parameters.items():
                re_pat = re.compile(regex_pattern)
                single_reference[parameter_name] = get_list_or_none(re_pat, reftext)
            
            data['other_references'].append(single_reference)
    
    ref_section_literature = soup.find('article').find('section', {'id' : 'literature-references'})

    if ref_section_literature is not None:

        data['literature_references'] = []

        literature_references = ref_section_literature.find_all('dl')

        for each_reference in literature_references:

            reftext = str(each_reference)
            single_reference = dict()

            ref_string_parameters = {
                'refName' : r'result-title.*\s*.*\s*.*?>(.*?)<',
                'refLink' : r'result-title.*\s*.*?href="(.*?)"',
                'refAuthor' : r'uthor:.*\s*.*?>(.*?)<',
                'refPublishedIn' : r'details.*\s*.*?In:.*?span>(.*?)<',
                'refPublishedInWhere' : r'details.*\s*.*In.*\s*\|(.*)',
                'refPublisher' : r'Publisher.*?span>(.*)<',
                'refPublicationPlace' : r'Publication place">(.*)<',
                'refPublicationDate' : r'ublication date">(.*)<',
                'refSourceLink' : r'Source.*\s*.*?href="(.*?)"',
                'refSourceName' : r'Source.*\s*.*?>(.*?)<',
                'refSourceID' : r'result-source.*\s*.*?ID:(.*)\)',
            }

            ref_list_parameters = {
                'refCountryTerritory' : r'Territory">(.*)<',
                'refKeywords' : r'keywords">(.*)<',
            }

            for parameter_name, regex_pattern in ref_string_parameters.items():
                re_pat = re.compile(regex_pattern)
                single_reference[parameter_name] = get_value_or_none(re_pat, reftext)

            for parameter_name, regex_pattern in ref_list_parameters.items():
                re_pat = re.compile(regex_pattern)
                single_reference[parameter_name] = get_list_or_none(re_pat, reftext)

            data['literature_references'].append(single_reference)


    if print_data:
        for key in data:
            print(key + ' : ' + str(data[key]))

    with open('literature\\' + data['name'][:150] + '.json', 'w') as outfile:
        json.dump(data, outfile)
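
Finally, a hedged usage sketch for whichever variant of get_content is in scope. The suffix list here is hypothetical, and the examples above write into relative folders (treaty, jurisprudence, legislation, treaty decisions, literature) that are assumed to exist, so we create them first.

import os

#: Hypothetical suffixes, normally collected from the ecolex search result pages.
suffixes = [
    '/details/treaty/some-example-treaty/',
    '/details/literature/some-example-article/',
]

#: The functions above expect these output folders to already exist.
for folder in ('treaty', 'jurisprudence', 'legislation', 'treaty decisions', 'literature'):
    os.makedirs(folder, exist_ok=True)

for suffix in suffixes:
    get_content(suffix, print_data=True)
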