Example #1
import codecs
from collections import defaultdict

from conceptnet5.edges import MultiWriter, make_edge

# CURRENT_DIR is a constant defined at module level in the original file.
def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += weight
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        writer_sa.write(edge)
        if license == '/l/CC/By':
            writer_core.write(edge)
    writer_core.close()
    writer_sa.close()
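
sum_assertions processes one numbered slice of the flat assertion dump. A
minimal driver sketch, assuming the slices are numbered 0..NUM_FILES-1
(NUM_FILES is a hypothetical constant, not part of the original module):

NUM_FILES = 8  # assumption: number of core_<i>.txt slices
for file_index in range(NUM_FILES):
    sum_assertions(file_index)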
Example #2
def run_single_process():
    writer = MultiWriter('conceptnet4_nadya')
    raw_assertions = RawAssertion.objects.filter()
    for raw_assertion in raw_assertions:
        edges = handle_raw_assertion(raw_assertion)
        for edge in edges:
            writer.write(edge)
    writer.close()
Example #3
def create_processes(self):
    processes = []
    for i in range(self.num_threads):
        writer = MultiWriter(self.writer_name + "_" + str(i))
        p = Process(target=self.pull_lines, args=(self.queue, writer))
        p.daemon = True
        p.start()
        processes.append(p)
    return processes
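
Example #3 hands each worker Process a shared queue and its own MultiWriter;
the pull_lines target itself is not shown. A hedged sketch of what such a
consumer might look like (the None shutdown sentinel and the
handle_raw_assertion call are assumptions):

def pull_lines(self, queue, writer):
    # Drain raw lines from the shared queue until a sentinel arrives,
    # writing the resulting edges through this process's own MultiWriter.
    while True:
        line = queue.get()
        if line is None:  # assumed shutdown sentinel
            break
        for edge in handle_raw_assertion(line):
            writer.write(edge)
    writer.close()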
Example #4
def run_single_process():
    writer = MultiWriter('conceptnet4')
    path = "./raw_data/"
    for filename in os.listdir(path):
        for raw_assertion in codecs.open(path + filename,
                                         encoding='utf-8',
                                         errors='replace'):
            edges = handle_raw_assertion(raw_assertion)
            for edge in edges:
                writer.write(edge)
    writer.close()
Example #5
def __init__(self):
    self.lang = None
    self.langcode = None
    self.inArticle = False
    self.inTitle = False
    self.curSense = None
    self.curTitle = ''
    self.curText = ''
    self.locales = []
    self.curRelation = None
    self.writer = MultiWriter('wiktionary')
Example #6
def __init__(self):
    self.lang = None
    self.langcode = None
    self.inArticle = False
    self.inTitle = False
    self.curSense = None
    self.curTitle = ''
    self.curText = ''
    self.locales = []
    self.curRelation = None
    self.writer = MultiWriter('wiktionary_ja')
    self.nosensetrans = None  # non-sense-specific translation
Example #7
def run_single_process():
    writer = MultiWriter('conceptnet4_zh')
    path = "./raw_data/"
    for filename in os.listdir(path):
        for line in codecs.open(path + filename,
                                encoding='utf-8',
                                errors='replace'):
            aggregate_assertion(line)
    for assertion, users in assertion_map.items():
        edges = handle_raw_assertion((assertion, users))
        for edge in edges:
            writer.write(edge)
    writer.close()
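
aggregate_assertion and assertion_map are defined elsewhere in this script.
A hedged sketch of the aggregation step, assuming the same comma-separated
line format that Example #16 parses (user, frame_id, concept1, concept2):

assertion_map = {}

def aggregate_assertion(line):
    # Hypothetical: group identical assertions, collecting the users who
    # contributed each one. The field layout is an assumption.
    parts = line.strip().split(', ')
    if len(parts) >= 4:
        user, assertion = parts[0], tuple(parts[1:4])
        assertion_map.setdefault(assertion, []).append(user)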
Example #8
def build_core_from_csvs(csv_files):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split(
                '\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += weight
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        # else:
        #     writer_sa.write(edge)
    writer_core.close()
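
No call site for build_core_from_csvs is shown; one plausible invocation,
assuming the flat CSV dumps live under data/flat/ (the path is an assumption):

import glob
build_core_from_csvs(sorted(glob.glob('data/flat/*.csv')))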
Example #9
counts = defaultdict(int)
text_similarities = []

flag_out = open('data/output/flagged_assertions.txt', 'w')
similar_out = open('data/output/text_similarity.txt', 'w')
weak_out = open('data/output/weak_assertions.txt', 'w')
good_out = open('data/output/ok_assertions.txt', 'w')
sources = ['/s/site/verbosity']

writer = None
if make_json:
    writer = MultiWriter('verbosity')

for line in open('raw_data/verbosity.txt'):
    parts = line.strip().split('\t')
    if not parts:
        counts['blank'] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]

    # catch bad stuff
    flagged = False

    for rword in right.split():
        if bad_regex_no_biscuit.match(rword):
            flagged = True  # assumed continuation; the original listing truncates here
            break
Example #10
        sources = [([creator_node, activity_node], 1)]

        for vote in raw.votes.all():
            sources.append(([normalize_uri('/s/contributor/omcs/'+vote.user.username),
                             normalize_uri(u'/s/activity/omcs/vote')], vote.vote))
        
        for source_list, weight in sources:
            bad = False
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            if 'bedume' in ' '.join(source_list):
                for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
                    check = '/'+flagged.replace(' ', '_')
                    if start.endswith(check) or end.endswith(check):
                        bad = True
                        print "flagged:", str(raw)
                        break
            if not bad:
                edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
                writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

if __name__ == '__main__':
    writer = MultiWriter('conceptnet4')
    queryset_foreach(RawAssertion.objects.filter(),
                     lambda item: handle_raw_assertion(item, writer))
    writer.close()
Example #11
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import MultiWriter, make_edge

import yaml
userdata = yaml.load_all(open('./GMUser.yaml'))
users = {}
writer = MultiWriter('globalmind')

lang_codes = {
    'eng': 'en',
    'cht': 'zh_TW',
    'chs': 'zh_CN',
    'jpn': 'ja',
    'kor': 'ko',
    'spa': 'es',
}

lang_names = {
    'eng': 'English',
    'en': 'English',
    'cht': 'Traditional Chinese',
    'zh_TW': 'Traditional Chinese',
    'chs': 'Simplified Chinese',
    'zh_CN': 'Simplified Chinese',
    'jpn': 'Japanese',
    'ja': 'Japanese',
    'kor': 'Korean',
    'ko': 'Korean',
    'spa': 'Spanish',
    'es': 'Spanish'
}
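
The two tables map GlobalMind's three-letter language codes to ConceptNet
codes and English display names. A small hypothetical helper (not in the
original) showing the intended lookup:

def describe_lang(gm_code):
    # Resolve a GlobalMind code to (ConceptNet code, English name).
    code = lang_codes.get(gm_code, gm_code)
    return code, lang_names.get(code, 'unknown')

print(describe_lang('cht'))  # ('zh_TW', 'Traditional Chinese')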
Example #12
        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score

        sources = [([activity_node], score / 5.)]

        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation,
                             start,
                             end,
                             dataset,
                             LICENSE,
                             source_list,
                             '/ctx/all',
                             frame_text,
                             weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    writer = MultiWriter('nadya.jp')
    queryset_foreach(RawAssertion.objects.filter(),
                     lambda item: handle_raw_assertion(item, writer))
    writer.close()
Example #13
def cycle_writer():
    global writer, WRITER_NUM
    writer.close()
    WRITER_NUM += 1
    writer = MultiWriter('dbpedia.%d' % WRITER_NUM)
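
cycle_writer closes the current MultiWriter and opens a fresh dbpedia.<N>
output, letting a long extraction split its edges across files. A hedged
sketch of how a loop might trigger the rotation (EDGES_PER_FILE and
edge_count are assumptions, not part of the original):

EDGES_PER_FILE = 100000  # assumption: edges per output file
edge_count = 0

def write_edge(edge):
    # Hypothetical wrapper: rotate to a new writer every EDGES_PER_FILE edges.
    global edge_count
    writer.write(edge)
    edge_count += 1
    if edge_count % EDGES_PER_FILE == 0:
        cycle_writer()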
Example #14
"""
Get data from DBPedia.
"""

__author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])'

from metanl.english import normalize_topic, un_camel_case
from conceptnet5.nodes import make_concept_uri, normalize_uri
from conceptnet5.edges import make_edge, MultiWriter, FlatEdgeWriter
import urllib
import urllib2

source = '/s/web/dbpedia.org'
WRITER_NUM = 1
writer = MultiWriter('dbpedia.%d' % WRITER_NUM)
sw_map = FlatEdgeWriter('data/sw/dbpedia.map.json')
sw_map_used = set()


def cycle_writer():
    global writer, WRITER_NUM
    writer.close()
    WRITER_NUM += 1
    writer = MultiWriter('dbpedia.%d' % WRITER_NUM)


def translate_wp_url(url):
    url = urllib.unquote(url).decode('utf-8', 'ignore')
    return un_camel_case(url.strip('/').split('/')[-1].split('#')[-1])

Example #15
    #            disambig = sense_name
    #            break
    #    if disambig is None:
    #        disambig = glossary[synset]
    #if disambig is None:
    #    disambig = '*'
    node = make_concept_uri(synset_name, 'en', pos+'/'+disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

sources = ['/s/wordnet/3.0']
writer = MultiWriter('wordnet3')
sw_map = FlatEdgeWriter('data/sw/wordnet30.map.json')
sw_map_used = set()

for line in chain(
    open('raw_data/wordnet-attribute.ttl'),
    open('raw_data/wordnet-causes.ttl'),
    open('raw_data/wordnet-classifiedby.ttl'),
    open('raw_data/wordnet-entailment.ttl'),
    open('raw_data/wordnet-hyponym.ttl'),
    open('raw_data/wordnet-instances.ttl'),
    open('raw_data/wordnet-membermeronym.ttl'),
    open('raw_data/wordnet-partmeronym.ttl'),
    open('raw_data/wordnet-sameverbgroupas.ttl'),
    open('raw_data/wordnet-similarity.ttl'),
    open('raw_data/wordnet-substancemeronym.ttl'),
Example #16
from conceptnet.models import *
import os
import codecs
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, MultiWriter

sparse_pieces = []
for filename in os.listdir('.'):
    if filename.startswith('conceptnet_zh_'):
        writer = MultiWriter(filename.split('.')[0])
        for line in codecs.open(filename, encoding='utf-8', errors='replace'):
            line = line.strip()
            if line:
                parts = line.split(', ')
                user, frame_id, concept1, concept2 = parts
                frame = Frame.objects.get(id=int(frame_id))
                ftext = frame.text
                relation = frame.relation.name
                rel = '/r/' + relation

                surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]') \
                                   .replace(u'{2}', u'[[' + concept2 + u']]')
                start = make_concept_uri(concept1, 'zh_TW')
                end = make_concept_uri(concept2, 'zh_TW')
                sources = [
                    '/s/contributor/petgame/' + user, '/s/activity/ntt/petgame'
                ]
                edge = make_edge(rel,
                                 start,
Example #17
def create_processes(self):
    for i in range(self.num_threads):
        writer = MultiWriter(self.writer_name + "_" + str(i), self.isTest)
        p = Process(target=self.pull_lines, args=(self.queue, writer))
        # p.daemon = True
        p.start()
Example #18
            if current_obj is None:
                # First row of a potential pair: remember it, keeping the
                # raw relation as the surface form.
                current_obj = obj
                current_score = obj['weight']
                obj['surfaceRel'] = obj['rel']
            elif obj['weight'] == current_score:
                # A second row with the same weight: if its normalized
                # arguments match, it carries the normalized relation for
                # the pair. Emit the merged edge and reset.
                if (normalize(obj['arg1']) == normalize(current_obj['arg1'])
                        and normalize(obj['arg2']) == normalize(current_obj['arg2'])):
                    current_obj['rel'] = obj['rel']
                output_edge(current_obj, writer)
                current_obj = None
                current_score = None
            else:
                # The weight changed: flush the pending row and start over.
                if current_obj is not None:
                    output_edge(current_obj, writer)
                current_obj = obj
                current_score = obj['weight']
                obj['surfaceRel'] = obj['rel']
    if current_obj is not None:
        output_edge(current_obj, writer)


if __name__ == '__main__':
    writer = MultiWriter('reverb-wp-frontpage')
    for file_to_read in REVERB_FILES:
        lines = codecs.open(file_to_read, encoding='utf-8', errors='replace')
        handle_lines(lines, writer)
    # Close the shared writer once every file has been handled.
    writer.close()
Example #19
weights = defaultdict(float)
assertions = {}
ccby = defaultdict(bool)

for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split(
        '\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += weight
        assertions[uri] = (rel, start, end, context, weights[uri])
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

for uri in assertions:
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation,
                     start,
                     end,
                     dataset,
                     license, ['/s/rule/sum_edges'],