Example #1
0
def handle_raw_assertion(line):
    """
    Parse one comma-separated petgame assertion line ("user, frame_id,
    concept1, concept2") and yield the corresponding ConceptNet 4 edge.
    """
    user, frame_id, concept1, concept2 = line.split(', ')
    frame = FRAME_DATA[frame_id]
    frame_text = frame['text']

    filled = frame_text.replace('{1}', '[[' + concept1 + ']]')
    filled = filled.replace('{2}', '[[' + concept2 + ']]')
    # We mark surface texts with * if {2} comes before {1}.
    if frame_text.find('{2}') < frame_text.find('{1}'):
        filled = '*' + filled

    contributor = {
        'contributor': '/s/contributor/petgame/' + user,
        'activity': '/s/activity/ptt/petgame',
    }
    yield make_edge(
        frame['relation'],
        standardized_concept_uri('zh_TW', concept1),
        standardized_concept_uri('zh_TW', concept2),
        dataset='/d/conceptnet/4/zh',
        license=Licenses.cc_attribution,
        sources=[contributor],
        surfaceText=filled,
        weight=1,
    )
Example #2
0
    def output_monolingual(self, lang, relation, term1, term2):
        """
        Write one monolingual edge between term1 and term2, read from the
        Japanese Wiktionary. Wiktionary-internal links and template pages
        are skipped.
        """
        # Skip Wiktionary: links and templates.
        for marker in (u'ウィク', u'テンプレート'):
            if marker in term1 or marker in term2:
                return

        # Convert a 3-letter language code to its 2-letter equivalent.
        if lang in LANGUAGES_3_TO_2:
            lang = LANGUAGES_3_TO_2[lang]
        start_uri = make_concept_uri_safe(term1, lang)
        if self.pos:
            end_uri = make_concept_uri_safe(term2, lang, self.pos)
        else:
            end_uri = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge('/r/' + relation, start_uri, end_uri,
                         '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #3
0
def handle_raw_assertion(line):
    """
    Convert one raw petgame assertion line into a ConceptNet 4 edge.
    Implemented as a generator yielding a single edge.
    """
    user, frame_id, concept1, concept2 = line.split(', ')
    frame = FRAME_DATA[frame_id]
    text = frame['text']

    surface = text.replace('{1}', '[[' + concept1 + ']]')
    surface = surface.replace('{2}', '[[' + concept2 + ']]')
    # Surface texts whose {2} slot appears before {1} get a '*' prefix.
    if text.find('{2}') < text.find('{1}'):
        surface = '*' + surface

    yield make_edge(
        frame['relation'],
        standardized_concept_uri('zh_TW', concept1),
        standardized_concept_uri('zh_TW', concept2),
        dataset='/d/conceptnet/4/zh',
        license=Licenses.cc_attribution,
        sources=[{
            'contributor': '/s/contributor/petgame/' + user,
            'activity': '/s/activity/ptt/petgame'
        }],
        surfaceText=surface,
        weight=1)
def output_assertion(out, **kwargs):
    """
    Rebuild an assertion from its parts and write it to `out` as one line
    of JSON.

    `kwargs` must include 'uri' (the expected URI of the rebuilt
    assertion) and 'sources' (a list of source URIs); everything else is
    passed straight through to `make_edge`.
    """
    uri = kwargs.pop('uri')
    # Deduplicate the sources, then combine them into a disjunction URI.
    source_tree = make_disjunction_uri(set(kwargs.pop('sources')))
    assertion = make_edge(sources=source_tree, **kwargs)
    # Sanity check: rebuilding the edge must reproduce the original URI.
    assert assertion['uri'] == uri, (assertion['uri'], uri)
    line = json.dumps(assertion, ensure_ascii=False)
    # Python 2 print-chevron syntax: writes `line` to the stream `out`.
    print >> out, line
Example #5
0
def handle_triple(line):
    """
    Process one N-Triples line from DBPedia: record the URI mappings it
    uses and, if the predicate maps onto a ConceptNet relation, write an
    edge for it.
    """
    items = line.split()
    # The subject, predicate, and object must all be <URI> tokens;
    # strip the angle brackets.
    for i in xrange(3):
        item = items[i]
        if not (item.startswith('<') and item.endswith('>')):
            return
        items[i] = item[1:-1]
    subj, pred, obj = items[:3]

    # Filter out triples that don't carry useful conceptual knowledge
    # (homepages, meta-features, dunder titles, lists).
    if ('foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj
            or '#Thing' in obj or '__' in subj or '__' in obj
            or 'List_of' in subj or 'List_of' in obj):
        return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        # This predicate has no ConceptNet equivalent.
        return
    rel = normalize_uri('/r/' + webrel)

    # Record each semantic-web-to-ConceptNet mapping the first time we
    # encounter it.
    for sw_uri, cn_uri in ((pred, rel), (subj, subj_concept),
                           (obj, obj_concept)):
        if (sw_uri, cn_uri) not in sw_map_used:
            sw_map_used.add((sw_uri, cn_uri))
            sw_map.write({'from': sw_uri, 'to': cn_uri})

    writer.write(make_edge(rel, subj_concept, obj_concept,
                           dataset='/d/dbpedia/en',
                           license='/l/CC/By-SA',
                           sources=['/s/dbpedia/3.7'],
                           context='/ctx/all',
                           weight=0.5))
def handle_raw_db_assertion(raw_assertion):
    """
    Turn one raw ConceptNet 4 database assertion into a list of edges.

    Returns [] for assertions that are skippable, come from a known-bad
    contributor, or raise any error during conversion (best-effort).
    """
    try:
        if can_skip(raw_assertion):
            return []

        # Assemble the pieces of the assertion.
        frame_text = build_frame_text(raw_assertion)
        relation = build_relation(raw_assertion)
        start = build_start(raw_assertion)
        end = build_end(raw_assertion)
        dataset = build_data_set(raw_assertion)

        edges = []
        for source_list, weight in build_sources(raw_assertion):
            # Assertions voted down on Commons 2 get a negative weight.
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1

            if by_bedume_and_bad(source_list, start, end, raw_assertion):
                # Known-bad contributor: drop the whole assertion.
                return []
            edges.append(make_edge(relation, start, end, dataset, LICENSE,
                                   source_list, '/ctx/all', frame_text,
                                   weight=weight))
        return edges
    except Exception:
        # Deliberate best-effort behavior: a malformed assertion yields
        # no edges rather than halting the run.
        return []
def handle_raw_flat_assertion(flat_assertion):
    """
    Convert one flat-file ConceptNet 4 assertion into a list of edges.

    Returns [] when the assertion is skippable or when any error occurs
    while building the edges (best-effort conversion).
    """
    try:
        parts_dict = extract_parts(flat_assertion)

        if can_skip(parts_dict):
            return []

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set()
        sources = build_sources(parts_dict)

        edges = []
        for source_list, weight in sources:
            # Assertions rejected on Commons 2 still produce an edge, but
            # with a negative weight. (A stray `else` here previously
            # dropped rejected assertions entirely, unlike the sibling
            # handlers in this file, which always emit the edge.)
            if "commons2_reject" in " ".join(source_list):
                weight = -1

            edge = make_edge(
                relation, start, end, dataset, LICENSE, source_list, "/ctx/all", frame_text, weight=weight
            )
            edges.append(edge)

        return edges
    except Exception:
        # Best-effort: malformed assertions yield no edges.
        return []
Example #8
0
def subwords_to_edges(language, input, output):
    """
    Turn Morfessor's hypothesized word segmentations into /r/SubwordOf
    edges usable in retrofitting, writing them to `output` as a Msgpack
    stream.
    """
    writer = MsgpackStreamWriter(output)
    for raw_line in input:
        text = raw_line.rstrip()
        # Skip blank lines and comment lines.
        if not text or text.startswith('#'):
            continue

        # Drop the unnecessary count field ("1 ") preceding each entry.
        text = text.split(' ', 1)[1]
        chunks = text.split(' + ')

        # Strip a possible trailing underscore, which would particularly
        # show up in the way we segment ATOMIC_SPACE_LANGUAGES
        # (Vietnamese).
        word = ''.join(chunks).strip('_')
        whole_uri = join_uri('c', language, word)
        for chunk in chunks:
            if chunk == '_':
                continue
            edge = make_edge(
                '/r/SubwordOf',
                join_uri('x', language, chunk.strip('_')),
                whole_uri,
                dataset='/d/morphology',
                license=Licenses.cc_attribution,
                sources=MORPH_SOURCES,
                weight=0.01,
            )
            writer.write(edge)
    writer.close()
 def output_sense_translation(self, lang, foreign, german, disambiguation):
     """
     Write a TranslationOf edge from a foreign-language term to the
     disambiguated German term it translates.
     """
     # Skip Wiktionary-internal links and templates.
     if 'Wik' in foreign or 'Wik' in german:
         return
     # Normalize Chinese codes to locale-style codes.
     if lang == 'zh-cn':
         lang = 'zh_CN'
     elif lang == 'zh-tw':
         lang = 'zh_TW'
     source = make_concept_uri(
       unicodedata.normalize('NFKC', foreign), lang
     )
     target = make_concept_uri(
       german, 'de', disambiguation
     )
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         # Unknown language code: fall back to a placeholder phrase.
         surfaceRel = "is [language %s] for" % lang
     # Bug fix: this previously interpolated the undefined name `english`,
     # which raised NameError at runtime; the translated term in this
     # method is `german`.
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, german, disambiguation.split('/')[-1].replace('_', ' '))
     edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                      license='/l/CC/By-SA',
                      sources=[SOURCE, TRANSLATE],
                      context='/ctx/all',
                      weight=1.5,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #10
0
 def output_sense_translation(self, lang, foreign, english, disambiguation):
     """
     Write a TranslationOf edge from a foreign-language term to the
     disambiguated English term it translates.
     """
     # Skip anything that looks like a Wiktionary-internal link/template.
     if 'Wik' in foreign or 'Wik' in english:
         return
     # Normalize Chinese language codes to locale-style codes.
     if lang == 'zh-cn':
         lang = 'zh_CN'
     elif lang == 'zh-tw':
         lang = 'zh_TW'
     source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
     target = make_concept_uri(english, 'en', disambiguation)
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         # Unknown language code: fall back to a placeholder phrase.
         surfaceRel = "is [language %s] for" % lang
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (
         foreign, surfaceRel, english,
         disambiguation.split('/')[-1].replace('_', ' '))
     # Python 2 print statement: logs each surface text as it's produced.
     print surfaceText
     edge = make_edge(relation,
                      source,
                      target,
                      '/d/wiktionary/en/%s' % lang,
                      license='/l/CC/By-SA',
                      sources=[SOURCE, TRANSLATE],
                      context='/ctx/all',
                      weight=1.5,
                      surfaceText=surfaceText)
     self.writer.write(edge)
def handle_raw_assertion(flat_assertion):
    """
    Convert one flat-file ConceptNet 4 assertion into a list of edges.
    Returns [] if the assertion is skippable, from a known-bad
    contributor, or fails to convert.
    """
    try:
        parts_dict = extract_parts(flat_assertion)

        if can_skip(parts_dict):
            return []

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set(parts_dict)
        sources = build_sources(parts_dict)

        edges = []
        for source_list, weight in sources:
            # Assertions rejected on Commons 2 get a negative weight.
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1

            if by_bedume_and_bad(source_list,start,end):
                # Known-bad contributor: drop the whole assertion.
                return []
            else:
                edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
                edges.append(edge)

        return edges
    except Exception:
        import traceback
        # Python 2 print statement: log which assertion failed, with its
        # traceback, before falling through to returning no edges.
        print "failed on flat_assertion: " + str(flat_assertion)
        traceback.print_exc()
        return []
Example #12
0
def handle_raw_assertion(raw_assertion):
    """
    Build the list of edges corresponding to one raw assertion, or []
    when the assertion is skippable or fails to convert.
    """
    try:
        if can_skip(raw_assertion):
            return []

        frame_text = build_frame_text(raw_assertion)
        relation = build_relation(raw_assertion)
        start = build_start(raw_assertion)
        end = build_end(raw_assertion)
        dataset = build_data_set()

        edges = []
        for source_list, weight in build_sources(raw_assertion):
            # Negative weight for assertions rejected on Commons 2.
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            edges.append(
                make_edge(relation, start, end, dataset, LICENSE,
                          source_list, '/ctx/all', frame_text,
                          weight=weight))
        return edges
    except Exception:
        # Best-effort conversion: swallow errors and produce no edges.
        return []
Example #13
0
def handle_raw_assertion(raw_assertion):
    """
    Convert one 'user, frame_id, concept1, concept2' line into a
    single-edge list; blank lines produce an empty list.
    """
    line = raw_assertion.strip()
    if not line:
        return []

    user, frame_id, concept1, concept2 = line.split(', ')
    frame = Frame.objects.get(id=int(frame_id))
    ftext = frame.text
    rel = '/r/' + frame.relation.name

    surface = ftext.replace(u'{1}', u'[[' + concept1 + u']]')
    surface = surface.replace(u'{2}', u'[[' + concept2 + u']]')
    edge = make_edge(rel,
                     make_concept_uri(concept1, 'zh_TW'),
                     make_concept_uri(concept2, 'zh_TW'),
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=['/s/contributor/petgame/' + user,
                              '/s/activity/ntt/petgame'],
                     surfaceText=surface,
                     weight=1)
    return [edge]
def sum_assertions(file_index):
    """
    Read the tab-separated assertion file for `file_index`, sum the
    weights of identical assertion URIs, and write one combined edge per
    CC-By assertion to the 'assertion_totals_core' writer.
    """
    weights = defaultdict(float)
    assertions = {}
    # Tracks which URIs are supported by at least one CC-By dataset.
    ccby = defaultdict(bool)

    for line in codecs.open(CURRENT_DIR +'/data/temp/core_'+str(file_index)+'.txt', 'r','utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        # Skip the header row and any context-specific assertions.
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            # Any dataset other than ReVerb/Wiktionary/DBPedia marks the
            # URI as CC-By (presumably those three are share-alike only
            # -- TODO confirm against the datasets' licenses).
            if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True


    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only CC-By edges are written; the share-alike writer is
        # commented out in this variant.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
Example #15
0
 def output_sense_translation(self, lang, foreign, english, sense):
     """
     Emit a TranslationOf edge linking a foreign-language term to the
     part-of-speech-disambiguated English sense it translates.
     """
     pos, disambiguation = sense
     # Skip Wiktionary-internal pages and terms flagged as bad.
     if 'Wik' in foreign or 'Wik' in english:
         return
     if term_is_bad(foreign) or term_is_bad(english):
         return
     # Quick fix that drops definitions written in Lojban syntax.
     if lang == 'jbo' and re.search(r'x[1-5]', english):
         return
     # Map generic Chinese codes to locale-specific ones.
     if lang == 'zh-cn':
         lang = 'zh_CN'
     elif lang == 'zh-tw':
         lang = 'zh_TW'
     source = normalized_concept_uri(lang,
                                     unicodedata.normalize('NFKC', foreign))
     target = normalized_concept_uri('en', english, pos, disambiguation)
     try:
         surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[lang.split('_')[0]])
     except KeyError:
         surfaceRel = "is [language %s] for" % lang
     gloss = disambiguation.split('/')[-1].replace('_', ' ')
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, english,
                                              gloss)
     edge = make_edge('/r/TranslationOf', source, target,
                      '/d/wiktionary/en/%s' % lang,
                      license=Licenses.cc_sharealike,
                      sources=[SOURCE, TRANSLATE],
                      weight=1.0,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #16
0
def output_edge(obj,writer):
    """
    Convert one ReVerb extraction (a dict) into a ConceptNet edge and
    write it with `writer`.
    """
    objsource = obj['sources'][0]
    # If either argument is a prefix or suffix of the source page title,
    # snap it to the full title.
    for arg in ('arg1', 'arg2'):
        if obj[arg].startswith(objsource) or obj[arg].endswith(objsource):
            obj[arg] = objsource
    start = make_concept_uri(obj['arg1'], 'en')
    end = make_concept_uri(obj['arg2'], 'en')
    # Capitalized relation names are ConceptNet relations; anything else
    # becomes a free-text relation represented as a concept.
    if obj['rel'][0] in string.uppercase:
        rel = '/r/' + obj['rel']
    else:
        rel = make_concept_uri(obj['rel'], 'en')
    # Drop deictic arguments ("this X" / "these X").
    deictic = ('/c/en/this_', '/c/en/these_')
    if start.startswith(deictic) or end.startswith(deictic):
        return
    context = make_concept_uri(objsource, 'en')
    source = "/s/web/en.wikipedia.org/wiki/%s" % (objsource.replace(' ', '_'))
    rules = ['/s/rule/reverb', '/s/rule/reverb_filter_apr2012']
    surfaceText = u"[[%s]] %s [[%s]]" % (obj['arg1'], obj.get('surfaceRel', obj['rel']), obj['arg2'])
    weight = float(obj['weight']) ** 3 / 2
    writer.write(make_edge(rel, start, end,
                           dataset='/d/reverb/wp_frontpage',
                           license='/l/CC/By-SA',
                           sources=[source] + rules,
                           context=context,
                           surfaceText=surfaceText,
                           weight=weight))
Example #17
0
 def output_translation(self, foreign, english, locale=''):
     """
     Emit a TranslationOf edge from a term in this Wiktionary's language
     to the English term it translates.
     """
     if term_is_bad(foreign) or term_is_bad(english):
         return
     # Quick fix that drops definitions written in Lojban syntax.
     if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
         return
     source = normalized_concept_uri(self.langcode + locale, foreign)
     target = normalized_concept_uri('en', english)
     try:
         surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
     except KeyError:
         # Unknown language code: use a placeholder phrase instead.
         surfaceRel = "is [language %s] for" % self.langcode
     surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
     edge = make_edge('/r/TranslationOf', source, target,
                      '/d/wiktionary/en/%s' % self.langcode,
                      license=Licenses.cc_sharealike,
                      sources=[SOURCE, INTERLINGUAL],
                      weight=1.0,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #18
0
def sum_assertions(file_index):
    """
    Read the tab-separated assertion file for `file_index`, sum the
    weights of identical assertion URIs, and write combined edges: CC-By
    assertions go to both the 'core' and 'sa' writers, share-alike ones
    only to 'sa' (so the 'sa' output is a superset of 'core').
    """
    weights = defaultdict(float)
    assertions = {}
    # Tracks which URIs are supported by at least one CC-By dataset.
    ccby = defaultdict(bool)

    for line in codecs.open(CURRENT_DIR +'/data/temp/core_'+str(file_index)+'.txt', 'r','utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        # Skip the header row and any context-specific assertions.
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            # Any dataset other than ReVerb/Wiktionary/DBPedia marks the
            # URI as CC-By (presumably those three are share-alike only
            # -- TODO confirm against the datasets' licenses).
            if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True


    writer_core = MultiWriter('assertion_totals_core')
    writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
            writer_sa.write(edge)
        else:
            writer_sa.write(edge)
    writer_core.close()
    writer_sa.close()
Example #19
0
 def output_sense_translation(self, lang, foreign, english, sense):
     """
     Write a TranslationOf edge from a foreign term to the
     part-of-speech-disambiguated English sense it translates.
     """
     pos, disambiguation = sense
     # Skip Wiktionary-internal pages and terms flagged as bad.
     unusable = ('Wik' in foreign or 'Wik' in english
                 or term_is_bad(foreign) or term_is_bad(english))
     if unusable:
         return
     # Quick fix that drops definitions written in Lojban syntax.
     if lang == 'jbo' and re.search(r'x[1-5]', english):
         return
     # Map generic Chinese codes to locale-specific ones.
     lang = {'zh-cn': 'zh_CN', 'zh-tw': 'zh_TW'}.get(lang, lang)
     source = normalized_concept_uri(
         lang, unicodedata.normalize('NFKC', foreign))
     target = normalized_concept_uri('en', english, pos, disambiguation)
     try:
         language_name = CODE_TO_ENGLISH_NAME[lang.split('_')[0]]
         surfaceRel = "is %s for" % (language_name)
     except KeyError:
         surfaceRel = "is [language %s] for" % lang
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (
         foreign, surfaceRel, english,
         disambiguation.split('/')[-1].replace('_', ' '))
     self.writer.write(make_edge('/r/TranslationOf',
                                 source,
                                 target,
                                 '/d/wiktionary/en/%s' % lang,
                                 license=Licenses.cc_sharealike,
                                 sources=[SOURCE, TRANSLATE],
                                 weight=1.0,
                                 surfaceText=surfaceText))
Example #20
0
    def handle_assertion(self, parts_dict):
        """
        Process one assertion from ConceptNet 4, given as a dictionary of
        its parts.

        We work from the 'raw' text -- text not yet reduced to a
        normalized form -- so that ConceptNet 5's own normalization can
        be applied instead.

        Yields a number of ConceptNet 5 edges, which will probably be
        grouped together into an assertion again.
        """
        if can_skip(parts_dict):
            return

        # Repair frames damaged long ago by a process that replaced real
        # prepositions with the word "around".
        preposition_fix = False
        if '} around {' in parts_dict['frame_text']:
            for prep in AROUND_PREPOSITIONS:
                prefix = prep + ' '
                if parts_dict['endText'].startswith(prefix):
                    # Move the preposition out of endText, back into the
                    # frame text.
                    parts_dict['endText'] = parts_dict['endText'][len(prefix):]
                    parts_dict['frame_text'] = parts_dict['frame_text'].replace(
                        '} around {', '} %s {' % prep
                    )
                    preposition_fix = True
                    break

        # The fix above may have made the assertion skippable; re-check.
        if can_skip(parts_dict):
            return

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set(parts_dict)
        weighted_sources = build_sources(parts_dict, preposition_fix)

        if relation in RELATIONS_TO_DROP:
            return

        if relation == '/r/DesireOf':
            # Fix an inconsistently-named relation from GlobalMind
            relation = '/r/Desires'

        for source_dict in weighted_sources:
            if skip_assertion(source_dict, start, end):
                continue
            weight = source_dict.pop('weight')
            yield make_edge(
                rel=relation,
                start=start,
                end=end,
                dataset=dataset,
                license=Licenses.cc_attribution,
                sources=[source_dict],
                surfaceText=frame_text,
                # The edge weight is the weight computed by build_sources,
                # times the multiplier set on this instance
                weight=weight * self.weight,
            )
Example #21
0
def handle_triple(line):
    """
    Process one N-Triples line from DBPedia: record the URI mappings it
    uses and, if the predicate maps onto a ConceptNet relation, write an
    edge for it.
    """
    items = line.split()
    # The subject, predicate, and object must all be <URI> tokens;
    # strip the angle brackets.
    for i in xrange(3):
        if not (items[i].startswith('<') and items[i].endswith('>')):
            return
        items[i] = items[i][1:-1]
    subj, pred, obj = items[:3]
    # Filter out triples that don't carry useful conceptual knowledge
    # (homepages, meta-features, dunder titles, lists).
    if 'foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj or '#Thing' in obj or '__' in subj or '__' in obj or 'List_of' in subj or 'List_of' in obj:
        return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        # This predicate has no ConceptNet equivalent.
        return
    rel = normalize_uri('/r/' + webrel)

    # Record each semantic-web-to-ConceptNet mapping once.
    if (pred, rel) not in sw_map_used:
        sw_map_used.add((pred, rel))
        sw_map.write({'from': pred, 'to': rel})
    if (subj, subj_concept) not in sw_map_used:
        sw_map_used.add((subj, subj_concept))
        sw_map.write({'from': subj, 'to': subj_concept})
    if (obj, obj_concept) not in sw_map_used:
        sw_map_used.add((obj, obj_concept))
        sw_map.write({'from': obj, 'to': obj_concept})

    edge = make_edge(rel,
                     subj_concept,
                     obj_concept,
                     dataset='/d/dbpedia/en',
                     license='/l/CC/By-SA',
                     sources=['/s/dbpedia/3.7'],
                     context='/ctx/all',
                     weight=0.5)
    writer.write(edge)
Example #22
0
 def output_sense_translation(self, lang, foreign, german, disambiguation):
     """
     Write a TranslationOf edge from a foreign-language term to the
     disambiguated German term it translates.
     """
     # Skip Wiktionary-internal links and templates.
     if "Wik" in foreign or "Wik" in german:
         return
     # Normalize Chinese codes to locale-style codes.
     if lang == "zh-cn":
         lang = "zh_CN"
     elif lang == "zh-tw":
         lang = "zh_TW"
     source = make_concept_uri(unicodedata.normalize("NFKC", foreign), lang)
     target = make_concept_uri(german, "de", disambiguation)
     relation = "/r/TranslationOf"
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         # Unknown language code: fall back to a placeholder phrase.
         surfaceRel = "is [language %s] for" % lang
     # Bug fix: the surface text previously interpolated the undefined
     # name `english`, raising NameError; the translated term in this
     # method is `german`.
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (
         foreign,
         surfaceRel,
         german,
         disambiguation.split("/")[-1].replace("_", " "),
     )
     edge = make_edge(
         relation,
         source,
         target,
         "/d/wiktionary/en/%s" % lang,
         license="/l/CC/By-SA",
         sources=[SOURCE, TRANSLATE],
         context="/ctx/all",
         weight=1.5,
         surfaceText=surfaceText,
     )
     self.writer.write(edge)
Example #23
0
def handle_raw_assertion(raw_assertion):
    """
    Convert an (assertion, users) pair from the petgame data into a
    one-edge list, weighting the edge by the number of supporting users.
    """
    assertion, users = raw_assertion
    frame_id, concept1, concept2 = assertion
    frame = Frame.objects.get(id=int(frame_id))
    rel = '/r/' + frame.relation.name

    surface = frame.text.replace(u'{1}', u'[[' + concept1 + u']]')
    surface = surface.replace(u'{2}', u'[[' + concept2 + u']]')
    sources = ['/s/activity/ptt/petgame'] + [
        '/s/contributor/petgame/' + user for user in users
    ]
    edge = make_edge(rel,
                     make_concept_uri(concept1, 'zh_TW'),
                     make_concept_uri(concept2, 'zh_TW'),
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surface,
                     weight=len(users))
    return [edge]
Example #24
0
 def output_sense_translation(self, lang, foreign, translated, disambiguation):
     """
     Write a TranslationOf edge from a foreign term to its (possibly
     disambiguated) translation in this Wiktionary's language.
     """
     # Colons indicate Wiktionary-internal pages; skip them.
     if u':' in foreign or u':' in translated:
         return
     # Map generic Chinese codes to locale-specific ones.
     if lang == 'zh-cn':
         lang = 'zh_CN'
     elif lang == 'zh-tw':
         lang = 'zh_TW'
     source = make_concept_uri_safe(
         unicodedata.normalize('NFKC', foreign), lang)
     target = make_concept_uri_safe(translated, self.langcode, disambiguation)
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         surfaceRel = "is [language %s] for" % lang
     if disambiguation and '/' in disambiguation:
         gloss = disambiguation.split('/')[-1].replace('_', ' ')
         surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel,
                                                  translated, gloss)
     else:
         surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
     edge = make_edge(relation, source, target,
                      '/d/wiktionary/ja/%s' % (self.langcode),
                      license='/l/CC/By-SA',
                      sources=[SOURCE, TRANSLATE],
                      context='/ctx/all',
                      weight=1.0,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #25
0
    def handle_assertion(self, parts_dict):
        """
        Process one assertion from ConceptNet 4, which appears in the input
        file as a dictionary.

        Use the 'raw' text -- the text that's not yet reduced to a normalized
        form -- so we can run ConceptNet 5's normalization on it instead.

        Each assertion becomes a number of ConceptNet 5 edges, which will
        probably be grouped together into an assertion again.
        """
        if can_skip(parts_dict):
            return

        # fix the result of some process that broke prepositions ages ago
        preposition_fix = False
        if '} around {' in parts_dict['frame_text']:
            for prep in AROUND_PREPOSITIONS:
                if parts_dict['endText'].startswith(prep + ' '):
                    # Strip "<prep> " from the start of endText and put
                    # the preposition back into the frame text.
                    parts_dict['endText'] = \
                        parts_dict['endText'][len(prep) + 1:]
                    replacement = '} %s {' % prep
                    parts_dict['frame_text'] = \
                        parts_dict['frame_text'].replace(
                            '} around {',
                            replacement
                        )
                    preposition_fix = True
                    break

        # The fix above may have changed the parts; re-check skippability.
        if can_skip(parts_dict):
            return

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set(parts_dict)
        weighted_sources = build_sources(parts_dict, preposition_fix)

        if relation in RELATIONS_TO_DROP:
            return

        if relation == '/r/DesireOf':
            # Fix an inconsistently-named relation from GlobalMind
            relation = '/r/Desires'

        # Yield one edge per source that isn't individually skippable.
        for source_dict in weighted_sources:
            if not skip_assertion(source_dict, start, end):
                weight = source_dict.pop('weight')
                yield make_edge(
                    rel=relation, start=start, end=end,
                    dataset=dataset, license=Licenses.cc_attribution,
                    sources=[source_dict], surfaceText=frame_text,

                    # The edge weight is the weight computed by build_sources,
                    # times the multiplier set on this instance
                    weight=weight * self.weight
                )
Example #26
0
def external_url_edge(start, end):
    """Return an /r/ExternalURL edge linking `start` to the URL `end`."""
    return make_edge(
        rel='/r/ExternalURL',
        start=start,
        end=end,
        dataset='/d/opencyc',
        license=Licenses.cc_attribution,
        sources=[SOURCE],
        weight=1.0,
    )
Example #27
0
def external_url_edge(start, end):
    """Link `start` to the external URL `end` with an /r/ExternalURL edge."""
    return make_edge(rel='/r/ExternalURL', start=start, end=end,
                     dataset='/d/opencyc', license=Licenses.cc_attribution,
                     sources=[SOURCE], weight=1.0)
Example #28
0
def output_edge(outfile, subj_concept, obj_concept):
    """
    Write a TranslationOf edge between two JMdict concepts to `outfile`
    as one line of JSON.
    """
    rel = '/r/TranslationOf'
    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/jmdict',
                     license='/l/CC/By-SA',
                     sources=['/s/jmdict/1.07'],
                     context='/ctx/all',
                     weight=0.5)
    # Python 2 print-chevron syntax: write the JSON line to `outfile`.
    print >> outfile, json.dumps(edge, ensure_ascii=False)
Example #29
0
def output_assertion(out, **kwargs):
    """
    Build an assertion edge from `kwargs` (passed straight through to
    `make_edge` -- an assertion is a kind of edge) and write it to the
    Msgpack output stream `out`.
    """
    out.write(make_edge(**kwargs))
Example #30
0
def output_edge(out, rel, subj_concept, obj_concept):
    """
    Write an edge to `out`, an instance of MsgpackStreamWriter.
    """
    jmdict_edge = make_edge(
        rel,
        subj_concept,
        obj_concept,
        dataset='/d/jmdict',
        license=Licenses.cc_sharealike,
        sources=[{'contributor': '/s/resource/jmdict/1.07'}],
        weight=2.0,
    )
    out.write(jmdict_edge)
Example #31
0
def handle_raw_assertion(raw, writer):
    """Convert one raw nadya.jp assertion into a ConceptNet edge and write
    it with `writer`. Any exception is caught and printed, so a bad record
    never stops the import. (Python 2 code: bare `print` statements.)"""
    try:
        lang = raw.language_id
        assert lang == 'ja'
        # Skip frames judged unreliable, and skip anything from the
        # rubycommons activity.
        if raw.frame.goodness < 1: return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return

        # build the assertion
        frame_text = raw.frame.text
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace(
            '{2}', '[[%s]]' % raw.text2)

        activity_node = normalize_uri(u'/s/site/nadya.jp')

        # Normalize both Japanese texts; report when normalization changed
        # the start text.
        startText = ' '.join(JA.normalize_list(raw.text1))
        endText = ' '.join(JA.normalize_list(raw.text2))
        if startText != raw.text1:
            print raw.text1.encode('utf-8'), '=>', startText.encode('utf-8')
        # NOTE(review): this first normalize_uri result is discarded — it
        # looks like it was meant to be assigned to `start`. Both `start`
        # and `end` are reassigned inside the loop below anyway, so these
        # two lines appear to be dead code — confirm before removing.
        normalize_uri('/text/' + lang + '/' + startText)
        end = normalize_uri('/text/' + lang + '/' + endText)

        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'

        # A non-positive polarity negates the relation.
        if polarity > 0:
            relation = normalize_uri('/r/' + relname)
        else:
            relation = normalize_uri('/r/Not' + relname)

        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score

        # The raw score (out of 5, presumably — confirm) becomes the weight.
        sources = [([activity_node], score / 5.)]

        for source_list, weight in sources:
            # Sources rejected on Commons 2 get a negative weight.
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation,
                             start,
                             end,
                             dataset,
                             LICENSE,
                             source_list,
                             '/ctx/all',
                             frame_text,
                             weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
Example #32
0
def output_edge(out, subj_concept, obj_concept):
    """
    Write an edge to `out`, an instance of MsgpackStreamWriter.
    """
    rel = '/r/TranslationOf'
    translation_edge = make_edge(
        rel,
        subj_concept,
        obj_concept,
        dataset='/d/jmdict',
        license=Licenses.cc_sharealike,
        sources=['/s/jmdict/1.07'],
        weight=0.5,
    )
    out.write(translation_edge)
Example #33
0
def output_assertion(out, **kwargs):
    """Write one assertion to `out` as a line of JSON, rescaling its
    weight onto a log-2 scale. (Python 2 `print >>` syntax.)"""
    # The caller's expected URI is checked against what make_edge computed.
    uri = kwargs.pop('uri')
    source_tree = make_disjunction_uri(set(kwargs.pop('sources')))
    assertion = make_edge(sources=source_tree, **kwargs)
    current_weight = assertion['weight']
    # log2(weight + 1); the max(1, ...) clamp keeps the argument positive,
    # so the result is never negative.
    log_weight = math.log(max(1, current_weight + 1)) / math.log(2)
    assertion['weight'] = log_weight

    assert assertion['uri'] == uri, (assertion['uri'], uri)
    line = json.dumps(assertion, ensure_ascii=False)
    print >> out, line
def output_assertion(out, **kwargs):
    """Emit one assertion as a JSON line on `out`, with a log-2 weight.
    All other keyword arguments are passed through to `make_edge`.
    (Python 2 `print >>` syntax.)"""
    # Pop 'uri' first: it is only used to cross-check make_edge's output.
    uri = kwargs.pop('uri')
    source_tree = make_disjunction_uri(set(kwargs.pop('sources')))
    assertion = make_edge(sources=source_tree, **kwargs)
    current_weight = assertion['weight']
    # Rescale: log2(weight + 1), clamped so the log argument is >= 1.
    log_weight = math.log(max(1, current_weight + 1)) / math.log(2)
    assertion['weight'] = log_weight

    assert assertion['uri'] == uri, (assertion['uri'], uri)
    line = json.dumps(assertion, ensure_ascii=False)
    print >> out, line
Example #35
0
def output_edge(outfile, subj_concept, obj_concept):
    """Write a '/r/TranslationOf' edge between the two concepts to
    `outfile` as one JSON line. (Python 2 `print >>` syntax.)"""
    rel = '/r/TranslationOf'
    edge = make_edge(rel,
                     subj_concept,
                     obj_concept,
                     dataset='/d/jmdict',
                     license='/l/CC/By-SA',
                     sources=['/s/jmdict/1.07'],
                     context='/ctx/all',
                     weight=0.5)
    # ensure_ascii=False keeps non-ASCII (Japanese) text unescaped.
    print >> outfile, json.dumps(edge, ensure_ascii=False)
Example #36
0
def umbel_edge(rel, start, end, surface, source):
    """
    Build the ConceptNet edge that represents one UMBEL assertion.
    """
    edge_fields = {
        'rel': rel,
        'start': start,
        'end': end,
        'dataset': '/d/umbel',
        'license': Licenses.cc_attribution,
        'sources': [source],
        'weight': 1.0,
        'surfaceText': surface,
    }
    return make_edge(**edge_fields)
Example #37
0
def umbel_edge(rel, start, end, surface, source):
    """
    Get the ConceptNet representation of an UMBEL edge.

    `surface` carries the human-readable surface text and `source`
    identifies where the assertion came from.
    """
    return make_edge(
        rel=rel, start=start, end=end,
        dataset='/d/umbel',
        sources=[source],
        license=Licenses.cc_attribution,
        surfaceText=surface,
        weight=1.0,
    )
Example #38
0
def handle_file(input_file, output_file):
    """
    Read an XML annotation file and write one ConceptNet edge per word of
    each annotation to `output_file` as a Msgpack stream.

    The language code is read from position [1] within the root's child
    node [0]; the annotations live under the root's child node [1].
    """
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    lang = root[0][1].attrib['type']

    # Guard against files with no annotation section: indexing root[1]
    # unconditionally would raise IndexError on such input.
    if len(root) >= 2:
        for annotation in root[1]:
            for word in strip_words(annotation.text):
                start = standardized_concept_uri('mul', annotation.attrib['cp'])
                end = standardized_concept_uri(lang, word)
                edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
                out.write(edge)

    # Close the writer so the output is flushed even when nothing was written.
    out.close()
Example #39
0
    def handle_assertion(self, parts_dict):
        """
        Process one assertion from ConceptNet 4, which appears in the input
        file as a dictionary.

        Use the 'raw' text -- the text that's not yet reduced to a normalized
        form -- so we can run ConceptNet 5's normalization on it instead.

        Each assertion becomes a number of ConceptNet 5 edges, which will
        probably be grouped together into an assertion again.

        This is a generator: it yields zero or more edges.
        """
        if can_skip(parts_dict):
            return

        # fix the result of some process that broke prepositions ages ago
        preposition_fix = False
        if '} around {' in parts_dict['frame_text']:
            for prep in AROUND_PREPOSITIONS:
                if parts_dict['endText'].startswith(prep + ' '):
                    # Strip the preposition (plus its trailing space) off the
                    # end text and restore it into the frame text instead.
                    parts_dict['endText'] = \
                        parts_dict['endText'][len(prep) + 1:]
                    replacement = '} %s {' % prep
                    parts_dict['frame_text'] = \
                        parts_dict['frame_text'].replace(
                            '} around {',
                            replacement
                        )
                    preposition_fix = True
                    break

        # Re-check after the fix: the mutated parts may now be skippable.
        if can_skip(parts_dict):
            return

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set(parts_dict)
        weighted_sources = build_sources(parts_dict, preposition_fix)

        # If any source was rejected on Commons 2, drop the whole assertion.
        for source_list, weight in weighted_sources:
            if 'commons2_reject' in ' '.join(source_list):
                return

        for source_list, weight in weighted_sources:
            if not by_bedume_and_bad(source_list, start, end):
                yield make_edge(
                    rel=relation, start=start, end=end,
                    dataset=dataset, license=Licenses.cc_attribution,
                    sources=source_list, surfaceText=frame_text,
                    weight=weight
                )
Example #40
0
def handle_triple(line, reader, out, map_out):
    """
    Convert one DBPedia triple into a ConceptNet edge: record the URI
    mapping in `map_out` and write the edge to `out`. Triples that fail
    any of the filters below are dropped silently.
    """
    subj, pred, obj, tag = reader.parse_line(line)
    if tag != 'URL':
        return

    # Skip edge types we don't care about:
    #   - Homepage links
    #   - GIS features
    #   - Assertions that something "is a thing"
    #   - Anonymous nodes identified with double-underscores, such as the
    #     node "Alfred_Nobel__1", which means "Alfred Nobel's occupation,
    #     whatever it is"
    #   - Nodes that are articles named "List of X" on Wikipedia
    unwanted = (
        'foaf/0.1/homepage' in pred
        or '_Feature' in obj
        or '#Thing' in obj
        or '__' in subj
        or '__' in obj
        or 'List_of' in subj
        or 'List_of' in obj
    )
    if unwanted:
        return

    # Only URIs inside dbpedia.org's namespace can be parsed.
    if 'dbpedia.org' not in obj:
        return

    subj_concept = translate_dbpedia_url(subj, 'en')
    obj_concept = translate_dbpedia_url(obj, 'en')

    # DBPedia categorizes a lot of things as 'works', which causes
    # unnecessary ambiguity. A more specific 'creative work' edge will
    # almost always exist anyway, so disregard these.
    if obj_concept == '/c/en/work':
        return

    rel = map_dbpedia_relation(pred)
    if rel is None:
        return

    # The triple converted cleanly; record each Semantic Web URL ->
    # ConceptNet URI mapping in the 'sw_map' file so others can follow it.
    pairs = [(pred, rel), (subj, subj_concept), (obj, obj_concept)]
    for sw_url, conceptnet_uri in pairs:
        map_out.write_link(full_conceptnet_url(conceptnet_uri), sw_url)

    out.write(make_edge(rel, subj_concept, obj_concept,
                        dataset='/d/dbpedia/en',
                        license=Licenses.cc_sharealike,
                        sources=['/s/dbpedia/3.7'],
                        weight=0.5))
Example #41
0
def opencyc_edge(rel, start, end, start_text, end_text):
    """
    Build the ConceptNet edge representing one OpenCyc assertion, keeping
    the original surface text of both endpoints.
    """
    return make_edge(rel=rel,
                     start=start,
                     end=end,
                     dataset='/d/opencyc',
                     license=Licenses.cc_attribution,
                     sources=[SOURCE],
                     weight=1.0,
                     surfaceStart=start_text,
                     surfaceEnd=end_text)
Example #42
0
def opencyc_edge(rel, start, end, start_text, end_text):
    """
    Get the ConceptNet representation of an OpenCyc edge.
    """
    fields = dict(
        rel=rel, start=start, end=end,
        dataset='/d/opencyc',
        license=Licenses.cc_attribution,
        sources=[SOURCE],
        weight=1.0,
        surfaceStart=start_text,
        surfaceEnd=end_text,
    )
    return make_edge(**fields)
Example #43
0
def output_edge(out, subj_concept, obj_concept):
    """
    Write a '/r/TranslationOf' edge between the two concepts to `out`,
    an instance of MsgpackStreamWriter.
    """
    rel = '/r/TranslationOf'
    out.write(make_edge(
        rel, subj_concept, obj_concept,
        dataset='/d/jmdict',
        license=Licenses.cc_sharealike,
        sources=['/s/jmdict/1.07'],
        weight=0.5,
    ))
Example #44
0
    def handle_raw_assertion(self, flat_assertion):
        """
        Turn one JSON-encoded raw assertion into zero or more edges,
        yielded as JSON strings.

        Deduplicates on (edge URI, contributor) pairs using
        self.seen_sources, so the same contributor never asserts the
        same edge twice across calls.
        """
        parts_dict = json.loads(flat_assertion)

        if can_skip(parts_dict):
            return

        # fix the result of some process that broke prepositions ages ago
        preposition_fix = False
        if '} around {' in parts_dict['frame_text']:
            for prep in AROUND_PREPOSITIONS:
                if parts_dict['endText'].startswith(prep + ' '):
                    # Move the preposition out of the end text and back
                    # into the frame text where it belongs.
                    parts_dict['endText'] = \
                        parts_dict['endText'][len(prep) + 1:]
                    replacement = '} %s {' % prep
                    parts_dict['frame_text'] = \
                        parts_dict['frame_text'].replace(
                            '} around {',
                            replacement
                        )
                    preposition_fix = True
                    break

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set(parts_dict)
        sources = build_sources(parts_dict, preposition_fix)

        # Any source rejected on Commons 2 poisons the whole assertion.
        reject = False
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                reject = True

        if not reject:
            for source_list, weight in sources:
                if not by_bedume_and_bad(source_list,start,end):
                    # At most one '/s/contributor/...' entry is expected
                    # per source list.
                    contributors = [s for s in source_list if s.startswith('/s/contributor')]
                    assert len(contributors) <= 1, contributors
                    edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
                    # Skip an edge we have already seen from this
                    # contributor; otherwise remember it.
                    okay = True
                    if contributors:
                        uri = edge['uri']
                        contributor = contributors[0]
                        if (uri, contributor) in self.seen_sources:
                            okay = False
                        else:
                            self.seen_sources.add((uri, contributor))
                    if okay:
                        yield json.dumps(edge, ensure_ascii=False)
def _make_assertion(line_group):
    """
    When a generator of tab-separated lines has been grouped by their
    assertion URI, take all the lines with the same URI and merge them
    into a single assertion (edge). Returns None when no usable lines
    remain, or when either concept should not be kept.
    """
    cleaned = [ln.rstrip() for ln in line_group]
    cleaned = [ln for ln in cleaned if ln]
    if not cleaned:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    uri, rel, start, end, _ = cleaned[0].split('\t')

    if not keep_concept(start) or not keep_concept(end):
        return None

    # Column 5 of each line holds a JSON blob of edge metadata.
    infos = [json.loads(ln.split('\t')[4]) for ln in cleaned]

    # Sum the per-line weights, then rescale the total.
    weight = weight_scale(sum(info['weight'] for info in infos))

    # Keep the first surface text found, and de-duplicate sources by the
    # conjunction URI of their sorted values.
    surface_text = None
    sources = []
    seen_sources = set()
    for info in infos:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                seen_sources.add(conjunction)
                sources.append(subsource)

    # ShareAlike is "sticky": if any contributing edge requires it, the
    # merged assertion does too.
    if Licenses.cc_sharealike in {info['license'] for info in infos}:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight,
        dataset=infos[0]['dataset'],
        license=license,
        sources=sources,
        surfaceText=surface_text,
    )
Example #46
0
def handle_raw_assertion(line):
    """
    Turn one comma-separated petgame line ("user, frame_id, concept1,
    concept2") into a ConceptNet 4 zh edge, yielded as a single item.
    """
    user, frame_id, concept1, concept2 = line.split(', ')
    frame = FRAME_DATA[frame_id]

    # Substitute both concepts into the frame's surface-text template.
    surfaceText = (frame['text']
                   .replace('{1}', '[[' + concept1 + ']]')
                   .replace('{2}', '[[' + concept2 + ']]'))

    yield make_edge(
        frame['relation'],
        normalized_concept_uri('zh_TW', concept1),
        normalized_concept_uri('zh_TW', concept2),
        dataset='/d/conceptnet/4/zh',
        license='/l/CC/By',
        sources=['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user],
        surfaceText=surfaceText,
        weight=1,
    )
Example #47
0
def output_edge(out, subj_concept, obj_concept):
    """
    Write an edge to `out`, an instance of JSONFileWriter.
    """
    rel = "/r/TranslationOf"
    translation = make_edge(
        rel,
        subj_concept,
        obj_concept,
        dataset="/d/jmdict",
        license=Licenses.cc_sharealike,
        sources=["/s/jmdict/1.07"],
        weight=0.5,
    )
    out.write(translation)
def make_assertion(line_group):
    """
    Merge the tab-separated lines that share one assertion URI into a
    single assertion (edge). Returns None when nothing usable remains.
    """
    lines = [line.rstrip() for line in line_group]
    lines = [line for line in lines if line]
    if not lines:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    uri, rel, start, end, _ = lines[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    if not (keep_concept(start) and keep_concept(end)):
        return None

    # Column 5 of each line is a JSON blob of edge metadata.
    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']
    surface_text = None
    sources = []
    seen_sources = set()
    # Keep the first surface text, and de-duplicate sources by the
    # conjunction URI of their sorted values.
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                sources.append(subsource)
                seen_sources.add(conjunction)

    weight = weight_scale(unscaled_weight)
    # ShareAlike wins if any contributing edge requires it.
    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(rel=rel,
                     start=start,
                     end=end,
                     weight=weight,
                     dataset=dataset,
                     license=license,
                     sources=sources,
                     surfaceText=surface_text)
Example #49
0
    def output_monolingual(self, lang, relation, term1, term2):
        """
        Write a monolingual edge for `relation` between `term1` and
        `term2` in `lang`, skipping terms judged bad.
        """
        if term_is_bad(term1) or term_is_bad(term2):
            return

        source = normalized_concept_uri(lang, term1)
        # Apply this parser's part of speech to the target, when set.
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)

        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        self.writer.write(make_edge(
            '/r/' + relation,
            source,
            target,
            '/d/wiktionary/%s/%s' % (lang, lang),
            license=Licenses.cc_sharealike,
            sources=[SOURCE, MONOLINGUAL],
            weight=1.0,
            surfaceText=surfaceText,
        ))
Example #50
0
def handle_raw_assertion(line):
    """
    Convert one petgame line into a ConceptNet 4 zh edge, yielded as a
    JSON string. Empty lines yield nothing.
    """
    if not line:
        return
    user, frame_id, concept1, concept2 = line.split(', ')
    frame = FRAME_DATA[frame_id]

    # Substitute the two concepts into the frame's surface-text template.
    surfaceText = (frame['text']
                   .replace(u'{1}', u'[[' + concept1 + u']]')
                   .replace(u'{2}', u'[[' + concept2 + u']]'))

    edge = make_edge(
        frame['relation'],
        make_concept_uri(concept1, 'zh_TW'),
        make_concept_uri(concept2, 'zh_TW'),
        dataset='/d/conceptnet/4/zh',
        license='/l/CC/By',
        sources=['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user],
        surfaceText=surfaceText,
        weight=1,
    )
    yield json.dumps(edge, ensure_ascii=False)
Example #51
0
def handle_file(input_filename, output_file):
    """
    Convert a tab-separated file of edges into a Msgpack stream.

    Each line has the columns: uri, start, rel, end, weight, source.
    A header row (first column literally 'uri') is skipped.
    """
    out = MsgpackStreamWriter(output_file)
    # `with` closes the input file even if a line fails to parse.
    with open(input_filename, encoding='utf-8') as infile:
        for line in infile:
            parts = line.rstrip('\n').split('\t')
            uri, start, rel, end, weight, source = parts
            # `continue`, not `return`: the header is the first line, so
            # returning here would silently skip the entire file.
            if uri == 'uri':
                continue

            edge = make_edge(rel=rel,
                             start=start,
                             end=end,
                             dataset=DATASET,
                             sources=[{
                                 'activity': SOURCE
                             }],
                             license=Licenses.cc_attribution,
                             weight=WEIGHT_TABLE[weight])
            out.write(edge)
    # Flush/close the output stream when done.
    out.close()
    def output_monolingual(self, lang, relation, term1, term2):
        """
        Write a monolingual Wiktionary edge between `term1` and `term2`,
        skipping terms that contain 'Wik' (Wiktionary-internal links).
        """
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        # Attach this parser's part of speech to the target when one is set.
        target = (make_concept_uri(term2, lang, self.pos)
                  if self.pos else make_concept_uri(term2, lang))
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge(
            '/r/' + relation,
            source,
            target,
            '/d/wiktionary/%s/%s' % (lang, lang),
            license='/l/CC/By-SA',
            sources=[SOURCE, MONOLINGUAL],
            context='/ctx/all',
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)
Example #53
0
def build_core_from_csvs(csv_files):
    """Sum edge weights across CSV files by assertion URI and write one
    combined edge per URI, keeping only the CC-By ('core') subset.
    (Python 2 code: bare print statements and dict.iteritems.)"""

    weights = defaultdict(float)
    assertions = {}
    # ccby[uri] becomes True if any contributing dataset is outside the
    # CC-By-SA-only sources (reverb, wiktionary, dbpedia).
    ccby = defaultdict(bool)

    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split(
                '\t')[:9]
            # Skip the header row and any context other than '/ctx/all'.
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                # Later lines overwrite earlier ones, but carry the running
                # weight total forward.
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        # CC-By assertions go to the combined-core dataset; everything
        # else would be combined-sa (currently not written out).
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #writer_sa.write(edge)
    writer_core.close()
Example #54
0
def handle_file(input_file, output_file):
    """
    Read an XML annotation file and write one ConceptNet edge per word of
    each annotation to `output_file`, or report when there is no data.
    """
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    # language is at position [1] within the child node [0]
    lang = root[0][1].attrib['type']

    if len(root) < 2:
        print("No emoji data in {!r}".format(input_file))
    else:
        for annotation in root[1]:
            for word in strip_words(annotation.text):
                start = standardized_concept_uri('mul', annotation.attrib['cp'])
                end = standardized_concept_uri(lang, word)
                out.write(make_edge(REL, start, end, DATASET, LICENSE, SOURCE))

    out.close()
Example #55
0
    def output_monolingual(self, lang, relation, term1, term2):
        """
        Write one monolingual edge for `relation` between `term1` and
        `term2` in `lang`, skipping terms judged bad.
        """
        if term_is_bad(term1) or term_is_bad(term2):
            return

        source = normalized_concept_uri(lang, term1)
        # Include this parser's part of speech in the target URI, if set.
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)

        edge = make_edge(
            '/r/' + relation,
            source,
            target,
            '/d/wiktionary/%s/%s' % (lang, lang),
            license=Licenses.cc_sharealike,
            sources=[SOURCE, MONOLINGUAL],
            weight=1.0,
            surfaceText="[[%s]] %s [[%s]]" % (term1, relation, term2),
        )
        self.writer.write(edge)