Example #1
def summarize_titles_data(data, fieldnames):
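    """Collect author names, GND links, subject labels and note titles from the
    document JSON, print a summary (including an English translation of the titles),
    and return them as a dict keyed by the given CSV fieldnames."""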

    global link_person_gnd
    link_person_gnd = []
    global titles
    titles = []
    global authors
    authors = []
    global labels
    labels = []
    if(PERSONS_JSON in data):
        for person in data[PERSONS_JSON]:
            name = common.toByteStr(person[NAME_JSON])
            authors.append(name)
            if(SAMEAS_JSON in person):
                links = [link for link in person[SAMEAS_JSON]]
                link_person_gnd_tmp = [link for link in links
                                       if GND_JSON in link]
                link_person_gnd.append(' '.join(link_person_gnd_tmp))


    if('terms' in data):
        for subject in data['terms']:
            if('labels' in subject):
                labels = [label for label in subject['labels']]

    if('notes' in data):
        titles = data['notes']
        print data['notes']

    print 'author: ', ' '.join(map(str,authors))
    print 'labels: ', ' '.join(map(str,labels))

    titles_res = ''
    str_list = []
    for elem in titles:
        byte_str = common.toByteStr(elem)
        titles_res = titles_res + byte_str + ' '
        str_list.append(byte_str)
    print 'titles out: ', titles_res

    print 'en translation: ', gs.translate(titles_res, 'en')

    #entry = {
    #    'gnd': ' '.join(map(str,link_person_gnd)),
    #    'author': ' '.join(map(str,authors)),
    #    'subject': ' '.join(map(str,labels)),
    #    'title': titles_res
    #}

    values = [
        ' '.join(map(str,link_person_gnd))
        , ' '.join(map(str,authors))
        , ' '.join(map(str,labels))
        , titles_res
    ]

    entry = dict(zip(fieldnames, values))
    return entry
Example #2
def summarize_titles_data(data, fieldnames):

    global link_person_gnd
    link_person_gnd = []
    global titles
    titles = []
    global authors
    authors = []
    global labels
    labels = []
    if (PERSONS_JSON in data):
        for person in data[PERSONS_JSON]:
            name = common.toByteStr(person[NAME_JSON])
            authors.append(name)
            if (SAMEAS_JSON in person):
                links = [link for link in person[SAMEAS_JSON]]
                link_person_gnd_tmp = [
                    link for link in links if GND_JSON in link
                ]
                link_person_gnd.append(' '.join(link_person_gnd_tmp))

    if ('terms' in data):
        for subject in data['terms']:
            if ('labels' in subject):
                labels = [label for label in subject['labels']]

    if ('notes' in data):
        titles = data['notes']
        print data['notes']

    print 'author: ', ' '.join(map(str, authors))
    print 'labels: ', ' '.join(map(str, labels))

    titles_res = ''
    str_list = []
    for elem in titles:
        byte_str = common.toByteStr(elem)
        titles_res = titles_res + byte_str + ' '
        str_list.append(byte_str)
    print 'titles out: ', titles_res

    print 'en translation: ', gs.translate(titles_res, 'en')

    #entry = {
    #    'gnd': ' '.join(map(str,link_person_gnd)),
    #    'author': ' '.join(map(str,authors)),
    #    'subject': ' '.join(map(str,labels)),
    #    'title': titles_res
    #}

    values = [
        ' '.join(map(str, link_person_gnd)), ' '.join(map(str, authors)),
        ' '.join(map(str, labels)), titles_res
    ]

    entry = dict(zip(fieldnames, values))
    return entry
Example #3
def calculate_musicbrainz_works_and_recordings_by_id(id, author, output_file):
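    """Query the MusicBrainz API for the works of the given artist id, print the
    reported work count, append an id/author/count row to the output CSV file and
    return the raw HTTP response."""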

    work_response = None
    try:
#        query_work = MUSICBRAINZ_API_URL + 'artist/' + id + '?inc=aliases%20works%20recordings&fmt=json'
        query_work = MUSICBRAINZ_API_URL + 'work?artist=' + id + '&inc=aliases&fmt=json'
        print 'query compositions:', query_work
        work_response = common.process_http_query(query_work)
        print 'musicbrainz composition:', work_response
        musicbrainz_composition_response_json = json.loads(work_response.content)
#        works_count = len(musicbrainz_composition_response_json[common.WORKS_JSON])
        compositions_count = str(musicbrainz_composition_response_json[common.WORK_COUNT_JSON])
        #recordings_count = len(musicbrainz_composition_response_json[common.RECORDINGS_JSON])
        #compositions_count = works_count + recordings_count
        print 'musicbrainz #composition:', compositions_count
        values = [
            id
            , common.toByteStr(author)
            , str(compositions_count)
        ]

        entry = dict(zip(common.musicbrainz_compositions_count_fieldnames, values))
        with open(output_file, 'ab') as csvfile:
            writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.musicbrainz_compositions_count_fieldnames, lineterminator='\n')
            writer.writerow(entry)
    except ValueError as ve:
        print 'Could not find JSON for given Musicbrainz composition.', id, ve.message
    except Exception as e:
        print 'Could not find Musicbrainz composition.', id, e.message

    return work_response
Example #4
def aggregate_compositions_data():
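    """Walk the Freebase composition dumps, fetch the data for each composition id
    and write id/mid/name rows into COMPOSITIONS_DATA_FILE."""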

    with codecs.open(COMPOSITIONS_DATA_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=composition_data_fieldnames,
                                lineterminator='\n')
        writer.writeheader()

        for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH +
                                   '*'):
            print inputfile
            compositions_content_json = common.read_json_file(inputfile)
            composition_json_list = compositions_content_json['result'][0][
                'compositions']
            composition_list = get_composition_id_list_from_json_list(
                composition_json_list)

            if len(composition_list) > 0:
                for index, composition_id in enumerate(composition_list):
                    composition_data = retrieve_compositions_data(
                        composition_id)
                    if composition_data:
                        try:
                            mid = composition_data['result'][0]['mid']
                            name = composition_data['result'][0]['name']
                            entry = build_composition_data_entry(
                                composition_id, mid, common.toByteStr(name))
                            writer.writerow(entry)
                        except:
                            print 'Composition values mid and/or name are empty.'
Example #5
def summarize_sameas_data(data, fieldnames):
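    """Build one CSV entry per person in the document JSON, containing the document
    id, the byte-string author name, the first GND link (if any) and all sameAs
    links joined by spaces."""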

    entries = []
    onb_id = ''

    if(DOC_ID_JSON in data):
        onb_id = data[DOC_ID_JSON]
        print data[DOC_ID_JSON]

    if(PERSONS_JSON in data):
        for person in data[PERSONS_JSON]:
            author = person[NAME_JSON]
            links = []
            link_person_gnd = []
            if(SAMEAS_JSON in person):
                links = [link for link in person[SAMEAS_JSON]]
                link_person_gnd = [link for link in links
                                   if GND_JSON in link]

            gnd = ''
            if(len(link_person_gnd) > 0):
                gnd = link_person_gnd[0]

            values = [
                onb_id
                , common.toByteStr(author)
                , gnd
                , ' '.join(map(str,links))
            ]
            entry = dict(zip(fieldnames, values))
            entries.append(entry)

    return entries
Example #6
def summarize_sameas_data(data, fieldnames):

    entries = []
    onb_id = ''

    if (DOC_ID_JSON in data):
        onb_id = data[DOC_ID_JSON]
        print data[DOC_ID_JSON]

    if (PERSONS_JSON in data):
        for person in data[PERSONS_JSON]:
            author = person[NAME_JSON]
            links = []
            link_person_gnd = []
            if (SAMEAS_JSON in person):
                links = [link for link in person[SAMEAS_JSON]]
                link_person_gnd = [link for link in links if GND_JSON in link]

            gnd = ''
            if (len(link_person_gnd) > 0):
                gnd = link_person_gnd[0]

            values = [
                onb_id,
                common.toByteStr(author), gnd, ' '.join(map(str, links))
            ]
            entry = dict(zip(fieldnames, values))
            entries.append(entry)

    return entries
Example #7
def get_composition_id_list_from_json_list(composition_json_list):
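    """Return the sorted list of lower-cased byte-string composition ids found in
    the given JSON list."""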

    composition_list = []
    for composition_json in composition_json_list:
        if composition_json['id'] is not None:
            composition_str = common.toByteStr(composition_json['id']).lower()
            if composition_str is not None:
                composition_list.append(composition_str)
    return sorted(composition_list)
Example #8
def get_composition_id_list_from_json_list(composition_json_list):

    composition_list = []
    for composition_json in composition_json_list:
        if composition_json['id'] is not None:
            composition_str = common.toByteStr(composition_json['id']).lower()
            if composition_str is not None:
                composition_list.append(composition_str)
    return sorted(composition_list)
Example #9
def summarize_authors_data(data, fieldnames):
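    """Build one CSV entry per person in the document JSON, containing the document
    id, the byte-string author name, the first GND link (if any) and the DBpedia ids
    found for the author."""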

    entries = []
    onb_id = ''

    if(DOC_ID_JSON in data):
        onb_id = data[DOC_ID_JSON]
        print data[DOC_ID_JSON]

    if(PERSONS_JSON in data):
        #isFirstTime = True
        for person in data[PERSONS_JSON]:
            author = person[NAME_JSON]
            link_person_gnd = []
            if(SAMEAS_JSON in person):
                links = [link for link in person[SAMEAS_JSON]]
                link_person_gnd = [link for link in links
                                   if GND_JSON in link]

            dbpedia_items = dbpedia_helper.find_dbpedia_items(author)
            dbpedia_id_res = ''
            for key, value in dbpedia_items.iteritems():
                dbpedia_id = dbpedia_helper.find_dbpedia_id(key)
                dbpedia_id_str = common.toByteStr(dbpedia_id)
                print 'DBPedia ID', dbpedia_id_str
                dbpedia_id_res = dbpedia_id_str + ' ' + dbpedia_id_res

            #if(isFirstTime == False):
            #    onb_id = ''
            gnd = ''
            if(len(link_person_gnd) > 0):
                gnd = link_person_gnd[0]

            values = [
                onb_id
                , common.toByteStr(author)
                , gnd
                , dbpedia_id_res
            ]
            entry = dict(zip(fieldnames, values))
            entries.append(entry)
            #isFirstTime = False

    return entries
Example #10
def store_composition_musicbrainz(id, json_data, author, output_file):
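    """Collect the alias names from a MusicBrainz work JSON and write an
    id/author/title/aliases row to the output CSV file."""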

    alias_names = ''
    for name_list in json_data[common.ALIASES_JSON]:
        value = name_list[common.NAME_JSON]
        if alias_names == '':
            alias_names = value
        else:
            alias_names += ' ' + value

    values = [
        id
        , common.toByteStr(author)
        , common.toByteStr(json_data[common.TITLE_JSON])
        , common.toByteStr(alias_names)
    ]

    entry = dict(zip(common.musicbrainz_works_and_recordings_fieldnames, values))
    write_composition_in_csv_file(output_file, entry)
Example #11
def build_viaf_composition_entry(
        author_id, author_name, composition_id, title):
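    """Map author id, author name, composition id and byte-string title onto the
    VIAF composition CSV fieldnames."""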

    values = [
        author_id
        , author_name
        , composition_id
        , common.toByteStr(title)
    ]

    return dict(zip(common.viaf_compositions_fieldnames, values))
Example #12
def extract_property_label(response, property):
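    """Return the English label of Wikidata property P<property> from the given
    response JSON as a byte string (empty string on error)."""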

    values = ''
    try:
        json_data = response
        values = json_data['entities']['P'+str(property)]['labels']['en']['value']
    except JSONDecodeError as jde:
        print 'JSONDecodeError. Response property data:', response, jde
    except:
        print 'Response json:', response
        print 'Unexpected error:', sys.exc_info()[0]
    print 'property:', property, 'values:', values
    return common.toByteStr(values)
Example #13
def summarize_authors_data(data, fieldnames):

    entries = []
    onb_id = ''

    if (DOC_ID_JSON in data):
        onb_id = data[DOC_ID_JSON]
        print data[DOC_ID_JSON]

    if (PERSONS_JSON in data):
        #isFirstTime = True
        for person in data[PERSONS_JSON]:
            author = person[NAME_JSON]
            link_person_gnd = []
            if (SAMEAS_JSON in person):
                links = [link for link in person[SAMEAS_JSON]]
                link_person_gnd = [link for link in links if GND_JSON in link]

            dbpedia_items = dbpedia_helper.find_dbpedia_items(author)
            dbpedia_id_res = ''
            for key, value in dbpedia_items.iteritems():
                dbpedia_id = dbpedia_helper.find_dbpedia_id(key)
                dbpedia_id_str = common.toByteStr(dbpedia_id)
                print 'DBPedia ID', dbpedia_id_str
                dbpedia_id_res = dbpedia_id_str + ' ' + dbpedia_id_res

            #if(isFirstTime == False):
            #    onb_id = ''
            gnd = ''
            if (len(link_person_gnd) > 0):
                gnd = link_person_gnd[0]

            values = [onb_id, common.toByteStr(author), gnd, dbpedia_id_res]
            entry = dict(zip(fieldnames, values))
            entries.append(entry)
            #isFirstTime = False

    return entries
Example #14
def extract_property_label(response, property):

    values = ''
    try:
        json_data = response
        values = json_data['entities']['P' +
                                       str(property)]['labels']['en']['value']
    except JSONDecodeError as jde:
        print 'JSONDecodeError. Response property data:', response, jde
    except:
        print 'Response json:', response
        print 'Unexpected error:', sys.exc_info()[0]
    print 'property:', property, 'values:', values
    return common.toByteStr(values)
Example #15
def save_mapping_authors_to_composition_count_in_csv(filename_authors, outputfile):
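    """Read author rows from the input CSV (skipping the header row), look up the
    composition count per author and write name/count rows to the output CSV file."""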

    reader = csv.DictReader(open(filename_authors), delimiter=';', fieldnames=common.viaf_compositions_count_fieldnames, lineterminator='\n')
    firstTime = True
    for row in reader:
        if not firstTime:
            print 'row', row
            author = row[common.AUTHOR_NAME]
            author_id = author.split('.')[0]
            name, length = count_compositions(author_id)
            if name is not None:
                print 'author:', name, 'len compositions', length
                entry = build_freebase_composition_count_entry(common.toByteStr(name), length)
                write_composition_in_csv_file(outputfile, entry)
        else:
            firstTime = False
Example #16
def analyze_compositions():
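    """Walk the Freebase composition dumps and write author/composition/parent/main
    rows to AUTHOR_COMPOSITIONS_FILE, deriving a parent title heuristically from
    common prefixes of consecutive composition titles."""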

    with codecs.open(AUTHOR_COMPOSITIONS_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=author_composition_fieldnames,
                                lineterminator='\n')
        writer.writeheader()

        for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH +
                                   '*'):
            print inputfile
            compositions_content_json = common.read_json_file(inputfile)
            name = compositions_content_json['result'][0]['name']
            composition_json_list = compositions_content_json['result'][0][
                'compositions']
            composition_list = get_composition_string_list_from_json_list(
                composition_json_list)

            if len(composition_list) > 0:
                #                parent = assign_parent(composition_list[0])

                parent = ''
                for index, composition in enumerate(composition_list):
                    main = composition
                    if index == 0:
                        parent = assign_parent(composition_list[0])
                    else:
                        #if parent not in composition:
                        if not composition.startswith(parent):
                            parent = assign_parent(composition)
                        else:
                            parent_new = common.find_common_substring(
                                parent, composition_list[index - 1])
                            #parent_new = common.find_common_parent(parent,composition_list[index-1])
                            # parent ending must be either ' ' or ','
                            if parent_new != '':
                                print 'parent:', parent, 'parent_new:', parent_new, 'composition:', composition
                                if (len(parent_new) <= len(composition)
                                    and composition[len(parent_new)-1] != ' ' \
                                    and composition[len(parent_new)-1] != ','):
                                    parent_new = composition
                                parent = parent_new
                    entry = build_author_composition_entry(
                        common.toByteStr(name), composition, parent, main)
                    writer.writerow(entry)
Example #17
def extract_property_value(response, property):
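    """Concatenate the mainsnak datavalues listed under key 'P<property>' in the
    given response JSON and return them as a byte string (empty string on error)."""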

    values = ''
    try:
        json_data = response
        for value_list in json_data['P' + str(property)]:
            value = value_list['mainsnak']['datavalue']['value']
            if values == '':
                values = value
            else:
                values = values + ' ' + value

            #property_data_list = json_data[PROPS_JSON][str(property)]
            #values = " ".join(str(value_list[VALUE_POS_IN_WIKIDATA_PROP_LIST]) for value_list in property_data_list)
    except JSONDecodeError as jde:
        print 'JSONDecodeError. Response author data:', response, jde
    except:
        print 'Response json:', response
        print 'Unexpected error:', sys.exc_info()[0]
    print 'property:', property, 'values:', values
    return common.toByteStr(values)
Example #18
def extract_property_value(response, property):

    values = ''
    try:
        json_data = response
        for value_list in json_data['P' + str(property)]:
            value = value_list['mainsnak']['datavalue']['value']
            if values == '':
                values = value
            else:
                values = values + ' ' + value

            #property_data_list = json_data[PROPS_JSON][str(property)]
            #values = " ".join(str(value_list[VALUE_POS_IN_WIKIDATA_PROP_LIST]) for value_list in property_data_list)
    except JSONDecodeError as jde:
        print 'JSONDecodeError. Response author data:', response, jde
    except:
        print 'Response json:', response
        print 'Unexpected error:', sys.exc_info()[0]
    print 'property:', property, 'values:', values
    return common.toByteStr(values)
Example #19
def save_mapping_authors_to_composition_count_in_csv(filename_authors,
                                                     outputfile):

    reader = csv.DictReader(
        open(filename_authors),
        delimiter=';',
        fieldnames=common.viaf_compositions_count_fieldnames,
        lineterminator='\n')
    firstTime = True
    for row in reader:
        if not firstTime:
            print 'row', row
            author = row[common.AUTHOR_NAME]
            author_id = author.split('.')[0]
            name, length = count_compositions(author_id)
            if name is not None:
                print 'author:', name, 'len compositions', length
                entry = build_freebase_composition_count_entry(
                    common.toByteStr(name), length)
                write_composition_in_csv_file(outputfile, entry)
        else:
            firstTime = False
Example #20
def analyze_compositions():

    with codecs.open(AUTHOR_COMPOSITIONS_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=author_composition_fieldnames, lineterminator='\n')
        writer.writeheader()

        for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH + '*'):
            print inputfile
            compositions_content_json = common.read_json_file(inputfile)
            name = compositions_content_json['result'][0]['name']
            composition_json_list = compositions_content_json['result'][0]['compositions']
            composition_list = get_composition_string_list_from_json_list(composition_json_list)

            if len(composition_list) > 0:
#                parent = assign_parent(composition_list[0])

                parent = ''
                for index, composition in enumerate(composition_list):
                    main = composition
                    if index == 0:
                        parent = assign_parent(composition_list[0])
                    else:
                        #if parent not in composition:
                        if not composition.startswith(parent):
                            parent = assign_parent(composition)
                        else:
                            parent_new = common.find_common_substring(parent,composition_list[index-1])
                            #parent_new = common.find_common_parent(parent,composition_list[index-1])
                            # parent ending must be either ' ' or ','
                            if parent_new != '':
                                print 'parent:', parent, 'parent_new:', parent_new, 'composition:', composition
                                if (len(parent_new) <= len(composition)
                                    and composition[len(parent_new)-1] != ' ' \
                                    and composition[len(parent_new)-1] != ','):
                                    parent_new = composition
                                parent = parent_new
                    entry = build_author_composition_entry(common.toByteStr(name), composition, parent, main)
                    writer.writerow(entry)
Example #21
def extract_and_save_label_data(europeana_response_json):
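    """Extract the first facet from a Europeana response, write one id/label/count
    row per facet field to FACET_COLLECTION_FILE and return the facet fields."""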

    fields = []
    try:
        fields = europeana_response_json[FACETS_JSON][0]
        with codecs.open(FACET_COLLECTION_FILE, 'w') as csvfile:
            writer = csv.DictWriter(csvfile,
                                    delimiter=';',
                                    fieldnames=facet_collection_fieldnames,
                                    lineterminator='\n')
            writer.writeheader()
        for field in fields[FIELDS_JSON]:
            label = field[LABEL_JSON]
            count = field[COUNT_JSON]
            with open(FACET_COLLECTION_FILE, 'ab') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        delimiter=';',
                                        fieldnames=facet_collection_fieldnames,
                                        lineterminator='\n')
                try:
                    id = ''
                    if '_' in label:
                        id = label.split('_')[0]
                    print 'label:', label, 'id:', id, 'count:', count
                    label_res = common.toByteStr(label)
                    values = [str(id), label_res, str(count)]
                    entry = dict(zip(facet_collection_fieldnames, values))
                    writer.writerow(entry)
                except UnicodeEncodeError as uee:
                    print 'UnicodeEncodeError. Writing data in CSV for europeana facet collection. label:', label, uee

    except JSONDecodeError as jde:
        print 'JSONDecodeError. Response europeana facet collection data:', jde
    except:
        print 'Response json:', europeana_response_json
        print 'Unexpected error:', sys.exc_info()[0]
    return fields
Example #22
def aggregate_compositions_data():

    with codecs.open(COMPOSITIONS_DATA_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=composition_data_fieldnames, lineterminator='\n')
        writer.writeheader()

        for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH + '*'):
            print inputfile
            compositions_content_json = common.read_json_file(inputfile)
            composition_json_list = compositions_content_json['result'][0]['compositions']
            composition_list = get_composition_id_list_from_json_list(composition_json_list)

            if len(composition_list) > 0:
                for index, composition_id in enumerate(composition_list):
                    composition_data = retrieve_compositions_data(composition_id)
                    if composition_data:
                        try:
                            mid = composition_data['result'][0]['mid']
                            name = composition_data['result'][0]['name']
                            entry = build_composition_data_entry(composition_id, mid, common.toByteStr(name))
                            writer.writerow(entry)
                        except:
                            print 'Composition values mid and/or name are empty.'