def sum_assertions(file_index):
    """Aggregate edge weights for one temp core file and write CC-By totals.

    Reads data/temp/core_<file_index>.txt (tab-separated assertion rows),
    sums the weight of every row sharing a URI in the '/ctx/all' context,
    and writes one summary edge per URI through the 'assertion_totals_core'
    writer -- but only for URIs whose sources permit a plain CC-By license.
    """
    total_weight = defaultdict(float)
    latest_fields = {}
    is_ccby = defaultdict(bool)

    source_path = CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt'
    for line in codecs.open(source_path, 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        # Guard clause: skip the header row and any non-global context.
        if uri == 'uri' or context != '/ctx/all':
            continue
        total_weight[uri] += float(weight)
        latest_fields[uri] = (rel, start, end, context, total_weight[uri])
        # ReVerb, Wiktionary and DBPedia sources force Share-Alike terms.
        if not dataset.startswith(('/d/reverb', '/d/wiktionary', '/d/dbpedia')):
            is_ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri in latest_fields:
        relation, start, end, context, weight = latest_fields[uri]
        if is_ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only CC-By edges are emitted; the Share-Alike writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
Example #2
0
def sum_assertions(file_index):
    """Sum assertion weights from one temp core file and emit total edges.

    Every row of data/temp/core_<file_index>.txt in the '/ctx/all' context
    contributes its weight to a per-URI running total.  One summary edge
    per URI is then written: CC-By edges go to both the core writer and
    the Share-Alike writer, CC-By-SA edges to the Share-Alike writer only.
    """
    running_total = defaultdict(float)
    fields_by_uri = {}
    allows_ccby = defaultdict(bool)

    path = CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt'
    for row in codecs.open(path, 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = row.split('\t')[:9]
        # Header row, or a context we do not aggregate over.
        if uri == 'uri' or context != '/ctx/all':
            continue
        running_total[uri] += float(weight)
        fields_by_uri[uri] = (rel, start, end, context, running_total[uri])
        # Only ReVerb / Wiktionary / DBPedia sources are Share-Alike-bound.
        sa_bound = (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia'))
        if not sa_bound:
            allows_ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    writer_sa = MultiWriter('assertion_totals_sa')
    for uri in fields_by_uri:
        relation, start, end, context, weight = fields_by_uri[uri]
        if allows_ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # CC-By edges are valid under both licenses, so they go to both
        # outputs; Share-Alike edges only go to the SA output.
        if license == '/l/CC/By':
            writer_core.write(edge)
        writer_sa.write(edge)
    writer_core.close()
    writer_sa.close()
Example #3
0
def build_core_from_csvs(csv_files):

    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split(
                '\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #writer_sa.write(edge)
    writer_core.close()
Example #4
0
def build_core_from_csvs(csv_files):

    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)


    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
Example #5
0
    if score <= 0:
        counts['low score'] += 1
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    
    if make_json:
        left = make_concept_uri(unicode(left), 'en')
        right = make_concept_uri(unicode(right), 'en')
        edge = make_edge(rel, left, right, '/d/verbosity',
                         '/l/CC/By', sources, surfaceText=text,
                         weight = score/10.0)
        writer.write(edge)


# If JSON edges were being generated, flush and close their writer.
if make_json:
    writer.close()

# Close the per-category line outputs opened earlier in this script.
flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

# Write one collected text-similarity score per line to the report file.
simout = open('data/output/similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
simout.close()
for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += float(weight)
        assertions[uri] = (rel, start, end, context, weight)
        if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

for uri, weight in assertions.iteritems():
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
writer_core.close()
#writer_sa.close()

Example #7
0
    
    score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000
    if score <= 0:
        counts['low score'] += 1
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    
    if make_json:
        edge = make_edge(rel, left, right, '/d/verbosity',
                         '/l/CC/By', sources, surfaceText=text,
                         weight = score/10.0)
        writer.write(edge)


# If JSON edges were being generated, flush and close their writer.
if make_json:
    writer.close()

# Close the per-category line outputs opened earlier in this script.
flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

# Write one collected text-similarity score per line to the report file.
simout = open('data/output/similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
simout.close()
Example #8
0
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

# Emit one summed edge per URI collected above.
# NOTE(review): the loop variable `weight` here receives the whole 5-tuple
# value from iteritems and is immediately overwritten by the unpack below;
# iterating over keys alone would be clearer.
for uri, weight in assertions.iteritems():
    if ccby[uri]:
        # CC-By-eligible URIs belong to the combined-core dataset.
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        # Everything else falls back to the Share-Alike dataset.
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation,
                     start,
                     end,
                     dataset,
                     license, ['/s/rule/sum_edges'],
                     '/ctx/all',
                     weight=weight)
    # Only CC-By edges are written; the Share-Alike writer is disabled.
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
writer_core.close()
#writer_sa.close()