Example #1
0
def run_single_process():
    """Convert every stored RawAssertion into edges, serially.

    Streams all RawAssertion rows from the database, expands each one
    into ConceptNet edges, and writes them through a single MultiWriter.
    """
    edge_writer = MultiWriter('conceptnet4_nadya')
    for assertion in RawAssertion.objects.filter():
        for produced_edge in handle_raw_assertion(assertion):
            edge_writer.write(produced_edge)
def run_single_process():
    """Serial conversion pass: read raw assertions, emit edges."""
    writer = MultiWriter('conceptnet4_nadya')
    queryset = RawAssertion.objects.filter()
    for item in queryset:
        edge_list = handle_raw_assertion(item)
        for single_edge in edge_list:
            writer.write(single_edge)
def sum_assertions(file_index):
    """Total up edge weights per assertion URI from one flat temp file.

    Reads ``data/temp/core_<file_index>.txt`` (tab-separated), accumulates
    the total weight of each assertion seen in the '/ctx/all' context,
    then writes one summary edge per CC-By assertion to the
    'assertion_totals_core' writer.

    :param file_index: numeric suffix selecting which temp file to read
    """
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    source_path = CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt'
    for line in codecs.open(source_path, 'r', 'utf-8'):
        # Only the first 9 tab-separated columns matter; extras are ignored.
        # 'row_id' avoids shadowing the builtin 'id'.
        uri, rel, start, end, context, weight, sources, row_id, dataset = \
            line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            # Accumulate the running total weight for this URI (single
            # float conversion is sufficient).
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            # Edges not derived from reverb/wiktionary/dbpedia may be
            # published under the more permissive CC-By license.
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            edge_license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            edge_license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, edge_license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only CC-By totals are written; the SA writer is disabled.
        if edge_license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
def run_single_process():
    """Walk every file in ./raw_data/ and write the edges each line yields."""
    out = MultiWriter("conceptnet4_nadya")
    data_dir = "./raw_data/"
    for name in os.listdir(data_dir):
        stream = codecs.open(data_dir + name, encoding="utf-8", errors="replace")
        for raw_line in stream:
            for new_edge in handle_raw_flat_assertion(raw_line):
                out.write(new_edge)
def run_single_process():
    """Convert every line of every raw-data file into ConceptNet edges."""
    output_writer = MultiWriter('conceptnet4')
    directory = "./raw_data/"
    for entry in os.listdir(directory):
        source_file = codecs.open(directory + entry, encoding='utf-8', errors='replace')
        for raw in source_file:
            for result in handle_raw_assertion(raw):
                output_writer.write(result)
Example #6
0
def run_single_process():
    """Two-phase Chinese conversion: aggregate all lines, then emit edges."""
    writer = MultiWriter('conceptnet4_zh')
    base_path = "./raw_data/"
    # Phase 1: fold every input line into the module-level assertion_map.
    for fname in os.listdir(base_path):
        for text_line in codecs.open(base_path + fname, encoding='utf-8', errors='replace'):
            aggregate_assertion(text_line)
    # Phase 2: each aggregated (assertion, users) pair becomes edges.
    for pair in assertion_map.items():
        for edge in handle_raw_assertion(pair):
            writer.write(edge)
Example #7
0
def run_single_process():
    """Read each raw-data file line by line and write the resulting edges."""
    sink = MultiWriter('conceptnet4')
    folder = "./raw_data/"
    for file_name in os.listdir(folder):
        reader = codecs.open(folder + file_name,
                             encoding='utf-8',
                             errors='replace')
        for assertion_line in reader:
            for out_edge in handle_raw_assertion(assertion_line):
                sink.write(out_edge)
Example #8
0
 def __init__(self):
     """Set up initial parse state and the output writer."""
     # Language/section state starts unset; buffers start empty.
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     # All extracted edges are written through this MultiWriter.
     self.writer = MultiWriter('wiktionary')
Example #9
0
def run_single_process():
    """Aggregate every raw-data line, then write edges for each group."""
    output = MultiWriter('conceptnet4_zh')
    root = "./raw_data/"
    for candidate in os.listdir(root):
        handle = codecs.open(root + candidate,
                             encoding='utf-8',
                             errors='replace')
        # First pass: accumulate lines into the global assertion_map.
        for content_line in handle:
            aggregate_assertion(content_line)
    # Second pass: convert each aggregated entry to edges and write them.
    for key_and_users in assertion_map.items():
        for result_edge in handle_raw_assertion(key_and_users):
            output.write(result_edge)
 def __init__(self):
     """Set up initial parse state and the output writer (Japanese run)."""
     # Language/section state starts unset; buffers start empty.
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     # Edges from this run go to the 'wiktionary_ja' writer.
     self.writer = MultiWriter('wiktionary_ja')
     self.nosensetrans = None # non-sense-specific translation
Example #11
0
 def create_processes(self):
     """Spawn one daemon worker per configured thread, each with its own
     numbered writer, and return the list of started processes."""
     workers = []
     for index in range(self.num_threads):
         per_worker_writer = MultiWriter(self.writer_name + "_" + str(index))
         worker = Process(target=self.pull_lines,
                          args=(self.queue, per_worker_writer))
         worker.daemon = True
         worker.start()
         workers.append(worker)
     return workers
Example #12
0
 def __init__(self):
     """Set up initial parse state and the output writer."""
     # Language/section state starts unset; buffers start empty.
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     # All extracted edges are written through this MultiWriter.
     self.writer = MultiWriter('wiktionary')
Example #13
0
 def __init__(self):
     """Set up initial parse state, the output writer, and the
     translation-mode flag."""
     # Language/section state starts unset; buffers start empty.
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ""
     self.curText = ""
     self.locales = []
     self.curRelation = None
     # All extracted edges are written through this MultiWriter.
     self.writer = MultiWriter("wiktionary")
     self.trans = False  # in translation mode
Example #14
0
def build_core_from_csvs(csv_files):

    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)


    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
Example #15
0
def build_core_from_csvs(csv_files):

    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split(
                '\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #writer_sa.write(edge)
    writer_core.close()
Example #16
0
def sum_assertions(file_index):
    """Total up edge weights per assertion URI from one flat temp file,
    then write one summary edge per CC-By assertion.

    :param file_index: numeric suffix selecting which temp file to read
    """
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for line in codecs.open(
            CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r',
            'utf-8'):
        # Only the first 9 columns matter; 'row_id' avoids shadowing the
        # builtin 'id'.
        uri, rel, start, end, context, weight, sources, row_id, dataset = \
            line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            # Accumulate the running total weight for this URI
            # (a single float conversion is sufficient).
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            # Edges not from reverb/wiktionary/dbpedia may be published
            # under the more permissive CC-By license.
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            edge_license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            edge_license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         edge_license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        # Only CC-By totals are written; the SA writer is disabled.
        if edge_license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #writer_sa.write(edge)
    writer_core.close()
Example #17
0
    #            disambig = sense_name
    #            break
    #    if disambig is None:
    #        disambig = glossary[synset]
    #if disambig is None:
    #    disambig = '*'
    node = make_concept_uri(synset_name, 'en', pos+'/'+disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

sources = ['/s/wordnet/3.0']
writer = MultiWriter('wordnet3')
sw_map = FlatEdgeWriter('data/sw/wordnet30.map.json')
sw_map_used = set()

for line in chain(
    open('raw_data/wordnet-attribute.ttl'),
    open('raw_data/wordnet-causes.ttl'),
    open('raw_data/wordnet-classifiedby.ttl'),
    open('raw_data/wordnet-entailment.ttl'),
    open('raw_data/wordnet-hyponym.ttl'),
    open('raw_data/wordnet-instances.ttl'),
    open('raw_data/wordnet-membermeronym.ttl'),
    open('raw_data/wordnet-partmeronym.ttl'),
    open('raw_data/wordnet-sameverbgroupas.ttl'),
    open('raw_data/wordnet-similarity.ttl'),
    open('raw_data/wordnet-substancemeronym.ttl'),
Example #18
0
        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'

        if polarity > 0:
            relation = normalize_uri('/r/'+relname)
        else:
            relation = normalize_uri('/r/Not'+relname)

        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score

        sources = [([activity_node], score/5.)]

        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

if __name__ == '__main__':
    # Entry point: stream every RawAssertion through handle_raw_assertion,
    # writing the resulting edges via the shared 'nadya.jp' writer.
    writer = MultiWriter('nadya.jp')
    queryset_foreach(RawAssertion.objects.filter(), lambda item: handle_raw_assertion(item, writer))
    writer.close()
Example #19
0
class FindTranslations(ContentHandler):
    """SAX handler that scans a German Wiktionary XML dump and emits
    ConceptNet edges for translations and monolingual word relations.
    """

    def __init__(self):
        """Set up initial parse state, the output writer, and the
        translation-mode flag."""
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ""
        self.curText = ""
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter("wiktionary")
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        """Begin buffering when a page or its title starts."""
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        """On leaving a page, process its accumulated wikitext."""
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        """Accumulate character data into the title or article buffer."""
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Split an article into lines and parse each one."""
        lines = text.split("\n")
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Parse one wikitext line for translations and relations."""
        # Only the trans-top/trans-bottom markers are needed here; the
        # other regex matches were computed but never used.
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)

        ### Get translation
        if trans_top_match:  # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:  # end translation part
            self.trans = False
        if self.trans and line.startswith("*{{"):  # get translation
            lang = line[3:5]  # get language of translation
            # find all translations of that language
            translations = re.findall(u"\{\{Ü.*?\|.*?\|(.*?)\}\}", line)
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, "")

        ### Get relation
        if line.startswith("{{Synonyme}}"):  # synonym
            self.curRelation = "synonym"
        elif line.startswith(u"{{Gegenwörter}}"):  # antonym
            self.curRelation = "antonym"
        elif line.startswith("{{Oberbegriffe}}"):  # hypernym
            self.curRelation = "hypernym"
        elif line.startswith("{{Unterbegriffe}}"):  # hyponym
            self.curRelation = "hyponym"
        elif line.startswith("{{Redewendungen}}"):  # idiom
            self.curRelation = "idiom"
        elif line.startswith("{{Charakteristische Wortkombinationen}}"):
        # word combination
            self.curRelation = "word combination"
        elif line.startswith("{{Wortbildungen}}"):  # morphology
            self.curRelation = "morphology"
        if self.curRelation and line == "":  # end relation
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual("deu", self.curRelation, related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one monolingual relation edge between term1 and term2."""
        if "Wik" in term1 or "Wik" in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        # print surfaceText

        edge = make_edge(
            "/r/" + relation,
            source,
            target,
            "/d/wiktionary/%s/%s" % (lang, lang),
            license="/l/CC/By-SA",
            sources=[SOURCE, MONOLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        """Write a TranslationOf edge from a foreign word to the German
        headword it translates."""
        if "Wik" in foreign or "Wik" in german:
            return
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign), lang)
        target = make_concept_uri(german, "de", disambiguation)
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # Use the German headword here; the previous code referenced an
        # undefined name 'english', which raised a NameError.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign,
            surfaceRel,
            german,
            disambiguation.split("/")[-1].replace("_", " "),
        )
        # print surfaceText
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % lang,
            license="/l/CC/By-SA",
            sources=[SOURCE, TRANSLATE],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=""):
        """Write a TranslationOf edge from a foreign word to an English one."""
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign), self.langcode + locale)
        target = make_concept_uri(english, "en")
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % self.langcode,
            license="/l/CC/By-SA",
            sources=[SOURCE, INTERLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)
Example #20
0
"""
Get data from DBPedia.
"""

__author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])'

from metanl.english import normalize_topic, un_camel_case
from conceptnet5.nodes import make_concept_uri, normalize_uri
from conceptnet5.edges import make_edge, MultiWriter, FlatEdgeWriter
import urllib
import urllib2

source = '/s/web/dbpedia.org'
WRITER_NUM = 1
writer = MultiWriter('dbpedia.%d' % WRITER_NUM)
sw_map = FlatEdgeWriter('data/sw/dbpedia.map.json')
sw_map_used = set()


def cycle_writer():
    """Close the current module-level writer and open a fresh one with the
    next sequence number (dbpedia.1, dbpedia.2, ...)."""
    global writer, WRITER_NUM
    writer.close()
    WRITER_NUM += 1
    writer = MultiWriter('dbpedia.%d' % WRITER_NUM)


def translate_wp_url(url):
    """Turn a Wikipedia/DBPedia URL into a readable, un-camel-cased title."""
    decoded = urllib.unquote(url).decode('utf-8', 'ignore')
    last_path_piece = decoded.strip('/').split('/')[-1]
    fragment = last_path_piece.split('#')[-1]
    return un_camel_case(fragment)

weights = defaultdict(float)
assertions = {}
ccby = defaultdict(bool)

for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += float(weight)
        assertions[uri] = (rel, start, end, context, weight)
        if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

for uri, weight in assertions.iteritems():
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
Example #22
0
class FindTranslations(ContentHandler):
    """SAX handler that scans an English Wiktionary XML dump and emits
    ConceptNet edges for translations and lexical relations."""

    def __init__(self):
        """Set up initial parse state and the output writer."""
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        """Begin buffering when a page or its title starts."""
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        """On leaving a page, process its accumulated wikitext."""
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        """Accumulate character data into the title or article buffer."""
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Split an article into lines and parse each one."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Parse one wikitext line, updating section state and emitting
        translation/relation edges as appropriate."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        # Section headers like ===Synonyms=== set the current relation
        # or part of speech.
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            # NOTE(review): the heading is probably 'Antonyms' (plural,
            # like 'Synonyms' above) — as written this branch may never
            # match; confirm against the dump.
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # Script tag distinguishes simplified ('s') vs traditional ('t').
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            # Definition line in a non-English section: treat the English
            # gloss as a translation of the headword.
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Horizontal rule separates language sections: reset state.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one monolingual relation edge between term1 and term2."""
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        """Write a TranslationOf edge from a foreign word to a
        sense-disambiguated English concept."""
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
          english, 'en', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, english, disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign word to an English one."""
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri(
          english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #23
0
 def create_processes(self):
     """Spawn one worker process per configured thread, each with its own
     numbered writer.

     NOTE(review): the started Process objects are not collected, returned
     or joined here, and the daemon flag is commented out — confirm the
     caller's shutdown strategy.
     """
     for i in range(self.num_threads):
         writer = MultiWriter(self.writer_name + "_" + str(i), self.isTest)
         p = Process(target=self.pull_lines, args=(self.queue, writer))
         #p.daemon=True
         p.start()
class FindTranslations(ContentHandler):
    """SAX handler for the German Wiktionary XML dump.

    Extracts two kinds of information from each article: translations out
    of German (the ``{{Ü|...}}`` templates inside translation tables) and
    German-German lexical relations listed under headers such as
    ``{{Synonyme}}`` or ``{{Oberbegriffe}}``, writing each as a ConceptNet
    edge through a MultiWriter.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')
        self.trans = False  # True while inside a translation table

    def startElement(self, name, attrs):
        # A <page> starts a new article; a <title> starts its title text.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        # Accumulate character data for the current title or article body.
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out of absurdly long articles
                self.inArticle = False

    def handleArticle(self, title, text):
        """Scan one article's wikitext line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Handle one wikitext line: translation tables and relation lists."""
        # (The original also computed LANGUAGE_HEADER / TRANS_TAG /
        # CHINESE_TAG matches here but never used them; removed.)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)

        ### Get translation
        if trans_top_match:  # start of a translation table
            self.trans = True
        if self.trans and trans_bottom_match:  # end of the translation table
            self.trans = False
        if self.trans and line.startswith('*{{'):  # one language's entry
            lang = line[3:5]  # two-letter code of the target language
            # find all {{Ü|...}} translation templates on this line
            translations = re.findall(u"\{\{Ü.*?\|.*?\|(.*?)\}\}", line)
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, '')

        ### Get relation
        if line.startswith('{{Synonyme}}'):  # synonym
            self.curRelation = 'synonym'
        elif line.startswith(u'{{Gegenwörter}}'):  # antonym
            self.curRelation = 'antonym'
        elif line.startswith('{{Oberbegriffe}}'):  # hypernym
            self.curRelation = 'hypernym'
        elif line.startswith('{{Unterbegriffe}}'):  # hyponym
            self.curRelation = 'hyponym'
        elif line.startswith('{{Redewendungen}}'):  # idiom
            self.curRelation = 'idiom'
        elif line.startswith('{{Charakteristische Wortkombinationen}}'):
            # word combination
            self.curRelation = 'word combination'
        elif line.startswith('{{Wortbildungen}}'):  # morphology
            self.curRelation = 'morphology'
        if self.curRelation and line == '':  # blank line ends a relation list
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual('deu', self.curRelation,
                                        related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write an edge for a same-language relation term1 -> term2."""
        if 'Wik' in term1 or 'Wik' in term2:
            # skip Wiktionary-internal links
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        """Write a TranslationOf edge from *foreign* (language *lang*) to
        the German word *german*, disambiguated by *disambiguation*."""
        if 'Wik' in foreign or 'Wik' in german:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
          german, 'de', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # BUG FIX: this line previously referenced an undefined name
        # `english`; the parameter in this German-dump handler is `german`,
        # so every call raised NameError.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, german, disambiguation.split('/')[-1].replace('_', ' '))
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from *foreign* to English *english*."""
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri(
          english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #25
0
# Per-category tallies of Verbosity assertions processed below.
counts = defaultdict(int)
text_similarities = []

# Output streams that triage assertions by quality.
flag_out = open('data/output/flagged_assertions.txt', 'w')
similar_out = open('data/output/text_similarity.txt', 'w')
weak_out = open('data/output/weak_assertions.txt', 'w')
good_out = open('data/output/ok_assertions.txt', 'w')
sources = ['/s/site/verbosity']




# Only create an edge writer when JSON output was requested
# (make_json is set earlier in this file).
writer = None
if make_json:
    writer = MultiWriter('verbosity')

for line in open('raw_data/verbosity.txt'):

    parts = line.strip().split('\t')
    if not parts:
        counts['blank'] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]


    # catch bad stuff
    flagged = False

    for rword in right.split():
        if bad_regex_no_biscuit.match(rword):
Example #26
0
            if current_obj is None:
                current_obj = obj
                current_score = obj['weight']
                obj['surfaceRel'] = obj['rel']
            elif obj['weight'] == current_score:
                if normalize(obj['arg1']) == normalize(
                        current_obj['arg1']) and normalize(
                            obj['arg2']) == normalize(current_obj['arg2']):
                    current_obj['rel'] = obj['rel']
                output_edge(current_obj, writer)
                current_obj = None
                current_score = None
            else:
                if current_obj is not None:
                    output_edge(current_obj, writer)
                current_obj = obj
                current_score = obj['weight']
                obj['surfaceRel'] = obj['rel']
    if current_obj is not None:
        output_edge(current_obj, writer)

    writer.close()


if __name__ == '__main__':
    # Feed every ReVerb input file through a single shared edge writer.
    writer = MultiWriter('reverb-wp-frontpage')
    for path in REVERB_FILES:
        stream = codecs.open(path, encoding='utf-8', errors='replace')
        handle_lines(stream, writer)
Example #27
0
        sources = [([creator_node, activity_node], 1)]

        for vote in raw.votes.all():
            sources.append(([normalize_uri('/s/contributor/omcs/'+vote.user.username),
                             normalize_uri(u'/s/activity/omcs/vote')], vote.vote))
        
        for source_list, weight in sources:
            bad = False
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            if 'bedume' in ' '.join(source_list):
                for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
                    check = '/'+flagged.replace(' ', '_')
                    if start.endswith(check) or end.endswith(check):
                        bad = True
                        print "flagged:", str(raw)
                        break
            if not bad:
                edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
                writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

if __name__ == '__main__':
    # Convert every ConceptNet 4 raw assertion into ConceptNet 5 edges.
    writer = MultiWriter('conceptnet4')

    def _process(item):
        # Forward each raw assertion to the handler with the shared writer,
        # preserving the handler's return value for queryset_foreach.
        return handle_raw_assertion(item, writer)

    queryset_foreach(RawAssertion.objects.filter(), _process)
    writer.close()
Example #28
0
        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score

        sources = [([activity_node], score / 5.)]

        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation,
                             start,
                             end,
                             dataset,
                             LICENSE,
                             source_list,
                             '/ctx/all',
                             frame_text,
                             weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    # Import nadya.jp raw assertions and write them out as edges.
    writer = MultiWriter('nadya.jp')

    def _process(item):
        # Delegate to the module-level handler with the shared writer,
        # preserving its return value for queryset_foreach.
        return handle_raw_assertion(item, writer)

    queryset_foreach(RawAssertion.objects.filter(), _process)
    writer.close()
Example #29
0
from conceptnet.models import *
import os
import codecs
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, MultiWriter

sparse_pieces = []
for filename in os.listdir('.'):
    if filename.startswith('conceptnet_zh_'):
        writer = MultiWriter(filename.split('.')[0])
        for line in codecs.open(filename, encoding='utf-8', errors='replace'):
            line = line.strip()
            if line:
                parts = line.split(', ')
                user, frame_id, concept1, concept2 = parts
                frame = Frame.objects.get(id=int(frame_id))
                ftext = frame.text
                relation = frame.relation.name
                rel = '/r/' + relation

                surfaceText = ftext.replace(u'{1}',
                                            u'[[' + concept1 + u']]').replace(
                                                u'{2}',
                                                u'[[' + concept2 + u']]')
                start = make_concept_uri(concept1, 'zh_TW')
                end = make_concept_uri(concept2, 'zh_TW')
                sources = [
                    '/s/contributor/petgame/' + user, '/s/activity/ntt/petgame'
                ]
                edge = make_edge(rel,
                                 start,
Example #30
0
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[
                0:
                1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        print surfaceText

        edge = make_edge('/r/' + relation,
                         source,
                         target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
        target = make_concept_uri(english, 'en', disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        print surfaceText
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #31
0
def cycle_writer():
    """Retire the current global writer and start a fresh one under the
    next sequential 'dbpedia.N' name."""
    global writer, WRITER_NUM
    WRITER_NUM += 1
    next_name = 'dbpedia.%d' % WRITER_NUM
    writer.close()
    writer = MultiWriter(next_name)
Example #32
0
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import MultiWriter, make_edge

import yaml
# SECURITY NOTE(review): yaml.load_all without an explicit Loader can
# construct arbitrary Python objects; prefer yaml.safe_load_all unless
# GMUser.yaml is fully trusted.
userdata = yaml.load_all(open('./GMUser.yaml'))
users = {}
writer = MultiWriter('globalmind')

# Map GlobalMind's 3-letter language codes to ConceptNet language codes.
lang_codes = {
    'eng': 'en',
    'cht': 'zh_TW',
    'chs': 'zh_CN',
    'jpn': 'ja',
    'kor': 'ko',
    'spa': 'es',
}

# Human-readable names for both the 3-letter and ConceptNet codes.
lang_names = {
    'eng': 'English',
    'en': 'English',
    'cht': 'Traditional Chinese',
    'zh_TW': 'Traditional Chinese',
    'chs': 'Simplified Chinese',
    'zh_CN': 'Simplified Chinese',
    'jpn': 'Japanese',
    'ja': 'Japanese',
    'kor': 'Korean',
    'ko': 'Korean',
    'spa': 'Spanish',
    'es': 'Spanish'
}
class FindTranslations(ContentHandler):
    """SAX handler for the Japanese Wiktionary XML dump.

    Emits ConceptNet edges for sense-specific translations, {{rel}}
    related-word sections, and non-sense-specific translation tables.
    Relies on module-level regexes (LANGUAGE_HEADER, TRANS_TOP,
    TRANS_BOTTOM, TRANS, TRANS_TAG, CHINESE_TAG) and helpers such as
    make_concept_uri_safe, make_edge, and get_language_code defined
    elsewhere in this file.
    """
    def __init__(self):
        self.lang = None              # current language section (raw header value)
        self.langcode = None          # language code derived from that header
        self.inArticle = False
        self.inTitle = False
        self.curSense = None          # "pos/sense" label while inside a translation table
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None       # relation in effect for related-word lists
        self.writer = MultiWriter('wiktionary_ja')
        self.nosensetrans = None # non-sense-specific translation state (None/1/2)

    def startElement(self, name, attrs):
        # A <page> starts a new article; a <title> starts its title text.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        # Accumulate character data for the current title or article body.
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        # Scan the article's wikitext one stripped line at a time.
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        # Update parser state and emit edges for one wikitext line.
        # NOTE(review): trans_tag_match and chinese_match are computed but
        # never used in this handler.
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if language_match:
            self.langcode = get_language_code(language_match.group(1))
        
        ### Get sense-specific translation
        if trans_top_match: # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos+'/'+sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match: # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{': # get translation
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:   # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language
            # (the [1:] skips the language-name link itself)
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:] 
            for translation in translations: # iterate over translations
                self.output_sense_translation(lang, translation, title, \
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='): # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode: # within relation part
            if line.startswith('*'): # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn': # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv': # derivative
                        self.curRelation = 'Derivative'                    
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation, \
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo' # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match: 
            self.nosensetrans = 1 # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2 # start non-sense-specific translation            
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')
    
    def output_monolingual(self, lang, relation, term1, term2):
        """Write an edge for a same-language relation term1 -> term2."""
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return

        if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated, disambiguation):
        """Write a TranslationOf edge from *foreign* (language *lang*) to
        *translated* in the current article language."""
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri_safe(
          translated, self.langcode, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, translated, disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
        
    def output_translation(self, foreign, japanese, locale=''):
        """Write a TranslationOf edge from *foreign* to Japanese *japanese*."""
        source = make_concept_uri_safe(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri_safe(
          japanese, 'ja'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target, '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #34
0
# Per-category tallies of Verbosity assertions processed below.
counts = defaultdict(int)
text_similarities = []

# Output streams that triage assertions by quality.
flag_out = open('data/output/flagged_assertions.txt', 'w')
similar_out = open('data/output/text_similarity.txt', 'w')
weak_out = open('data/output/weak_assertions.txt', 'w')
good_out = open('data/output/ok_assertions.txt', 'w')
sources = ['/s/site/verbosity']




# Only create an edge writer when JSON output was requested
# (make_json is set earlier in this file).
writer = None
if make_json:
    writer = MultiWriter('verbosity')

for line in open('raw_data/verbosity.txt'):

    parts = line.strip().split('\t')
    if not parts:
        counts['blank'] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]


    # catch bad stuff
    flagged = False

    for rword in right.split():
        if bad_regex_no_biscuit.match(rword):
Example #35
0
# Tracks which assertion URIs qualify for the CC-By (core) license.
ccby = defaultdict(bool)

# NOTE(review): `weights` and `assertions` are defined earlier in this file.
for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split(
        '\t')[:9]
    # The first row repeats the column names; `uri != 'uri'` skips it.
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += float(weight)
        # NOTE(review): stores this line's weight, not the accumulated
        # weights[uri] — confirm whether the summed weight was intended.
        assertions[uri] = (rel, start, end, context, weight)
        # reverb/wiktionary/dbpedia data is By-SA; everything else may be By.
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

for uri, weight in assertions.iteritems():
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation,
                     start,
                     end,
                     dataset,
                     license, ['/s/rule/sum_edges'],