def __init__(self, lang='en'):
    # Load the cached concept assertions if a pickle exists; otherwise
    # start with an empty cache. Catch only the errors a missing or
    # corrupt pickle can raise, rather than a bare except.
    try:
        assertion_pickle = open('data/concept_assertion.pickle', 'rb')
        self.concept_assertions = pickle.load(assertion_pickle)
        assertion_pickle.close()
    except (IOError, EOFError, pickle.UnpicklingError):
        self.concept_assertions = {}
    self.nlp = simplenlp.get(lang)
    self.stopwords = stopwords.words('english')
    self.stopwords.extend(concept_stopwords)
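# For reference, a minimal sketch of the counterpart that would write the
# cache this __init__ reads. The method name `save_assertions` is an
# assumption for illustration, not part of the original class.
def save_assertions(self):
    # Persist the in-memory cache so the next __init__ can load it.
    with open('data/concept_assertion.pickle', 'wb') as f:
        pickle.dump(self.concept_assertions, f)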
def lookup_concept_from_nl(language, text):
    """
    Look up a concept using any natural language text that represents it.

    This function requires the :mod:`simplenlp` module to normalize
    natural language text into a raw concept name.
    """
    import simplenlp
    # NOTE: the English tools are used regardless of `language`;
    # normalization here is hard-coded to 'en'.
    nltools = simplenlp.get('en')
    normalized = nltools.normalize(text)
    return lookup_concept_raw(language, normalized)
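# A hypothetical usage sketch. The normalized form shown is an assumption
# about what simplenlp returns, not a verified output.
concept = lookup_concept_from_nl('en', 'dogs barking')
# ...which is equivalent to something like:
#     lookup_concept_raw('en', 'dog bark')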
def make_concept_uri(text, lang, disambiguation=None):
    if lang == 'en':
        from metanl import english
        normalized, disambig = english.normalize_topic(text)
    elif lang == 'ja':
        from metanl import japanese
        normalized, disambig = japanese.normalize(text), None
    elif lang in ('pt', 'hu', 'nl', 'es'):
        # languages where we know the stopword list
        import simplenlp
        nlp = simplenlp.get(lang)
        normalized, disambig = nlp.normalize(text), None
    else:
        normalized = text
        disambig = None
    if disambiguation is not None:
        disambig = disambiguation
    if disambig is not None:
        disambig = disambig.replace(' ', '_')
    if disambig:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambig)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))
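# A quick sketch of the URIs this produces. The normalized forms are
# assumptions about metanl's output, not verified results.
make_concept_uri('dogs', 'en')
#   -> '/c/en/dog', assuming english.normalize_topic('dogs') is ('dog', None)
make_concept_uri('python', 'en', 'programming language')
#   -> '/c/en/python/programming_language' (explicit disambiguation wins)
make_concept_uri('Katze', 'de')
#   -> '/c/de/Katze' (no normalizer is wired up for German, so the text
#      passes through unchanged)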
import simplenlp
EN = simplenlp.get('en')

from conceptnet5.english_nlp import normalize


def check_line(line):
    parts = line.strip().split()
    norm = normalize(parts[0])
    if norm != parts[1]:
        print "Original: %s / WordNet: %s / conceptnet: %s" % \
            (parts[0], parts[1], norm)


def compare_words():
    for line in open('/Users/rspeer/nltk_data/corpora/wordnet/noun.exc'):
        check_line(line)
    for line in open('/Users/rspeer/nltk_data/corpora/wordnet/verb.exc'):
        check_line(line)

if __name__ == '__main__':
    compare_words()
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import wordnet
import simplenlp
import re

EN = simplenlp.get('en')

try:
    morphy = wordnet.morphy
except LookupError:
    nltk.download('wordnet')
    morphy = wordnet.morphy

STOPWORDS = ['the', 'a', 'an']

EXCEPTIONS = {
    # Avoid obsolete and obscure roots, the way lexicographers don't.
    'wrought': 'wrought',    # not 'work'
    'media': 'media',        # not 'medium'
    'installed': 'install',  # not 'instal'
    'installing': 'install', # not 'instal'
    'synapses': 'synapse',   # not 'synapsis'
    'soles': 'sole',         # not 'sol'
    'pubes': 'pube',         # not 'pubis'
    'dui': 'dui',            # not 'duo'
    'comics': 'comic',       # WordNet's root for this will make you nerd-rage
    'taxis': 'taxi',         # not 'taxis'
    'teeth': 'tooth',        # not 'teeth'

    # Avoid nouns that shadow more common verbs.
    'am': 'be',
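# A minimal sketch of how these tables could plug into a lemmatizer. The
# function name `morphy_stem` and its exact behavior are assumptions about
# the rest of this module, not the original code.
def morphy_stem(word):
    word = word.lower()
    # Check the hand-written exception table before trusting WordNet.
    if word in EXCEPTIONS:
        return EXCEPTIONS[word]
    # Fall back to morphy; return the word unchanged if it finds no root.
    return morphy(word) or word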
def put_raw_assertion_in_graph(raw):
    try:
        lang = raw.language_id
        if raw.frame.goodness < 1:
            return
        if lang.startswith('zh'):
            return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity:
            return

        # build the assertion
        raw_arg1 = GRAPH.get_or_create_concept(lang, raw.text1)
        raw_arg2 = GRAPH.get_or_create_concept(lang, raw.text2)
        frame_text = raw.frame.text
        if polarity > 0:
            frame_text = frame_text.replace('{%}', '')
        else:
            frame_text = frame_text.replace('{%}', 'not')
        frame = GRAPH.get_or_create_frame(lang, frame_text)
        raw_assertion = GRAPH.get_or_create_assertion(
            frame, [raw_arg1, raw_arg2],
            {'dataset': 'conceptnet/4/'+lang, 'license': 'CC-By',
             'normalized': False}
        )

        # create justification structure
        creator = raw.sentence.creator.username
        if creator == 'verbosity':
            return
        creator_node = GRAPH.get_or_create_node(
            u'/source/contributor/omcs/'+creator
        )
        activity_node = GRAPH.get_or_create_node(
            u'/source/activity/omcs/'+activity
        )
        GRAPH.justify(OMCS, activity_node)
        GRAPH.justify(OMCS, creator_node)
        conjunction = GRAPH.get_or_create_conjunction(
            [creator_node, activity_node]
        )
        GRAPH.justify(conjunction, raw_assertion)

        # make the normalized version
        if lang == 'en':
            arg1 = GRAPH.get_or_create_concept('en', en_normalize(raw.text1))
            arg2 = GRAPH.get_or_create_concept('en', en_normalize(raw.text2))
        elif lang == 'ja':
            arg1 = GRAPH.get_or_create_concept('ja', JA.normalize(raw.text1))
            arg2 = GRAPH.get_or_create_concept('ja', JA.normalize(raw.text2))
        else:
            nlp = simplenlp.get(lang)
            arg1 = GRAPH.get_or_create_concept(lang, nlp.normalize(raw.text1))
            arg2 = GRAPH.get_or_create_concept(lang, nlp.normalize(raw.text2))
        if polarity > 0:
            relation = GRAPH.get_or_create_relation(raw.frame.relation.name)
        else:
            relation = GRAPH.get_or_create_relation(
                'Not' + raw.frame.relation.name
            )
        assertion = GRAPH.get_or_create_assertion(
            relation, [arg1, arg2],
            {'dataset': 'conceptnet/4/'+lang, 'license': 'CC-By',
             'normalized': True}
        )
        for vote in raw.votes.all():
            voter = GRAPH.get_or_create_node(
                u'/source/contributor/omcs/'+vote.user.username
            )
            GRAPH.justify(OMCS, voter)
            GRAPH.justify(voter, raw_assertion, weight=vote.vote)
        GRAPH.derive_normalized(raw_assertion, assertion)
        print assertion
    except Exception:
        import traceback
        traceback.print_exc()
from csc_utils.batch import queryset_foreach
from conceptnet.models import Sentence, Assertion, RawAssertion
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize as en_normalize
import simplenlp

GRAPH = JSONWriterGraph('json_data/conceptnet')

OMCS = GRAPH.get_or_create_node('/source/site/omcs')
GRAPH.justify('/', OMCS)

JA = simplenlp.get('ja')

# monkey-patch: treat no Japanese token as a stopword during normalization
def answer_false(*args):
    return False
JA.is_stopword_record = answer_false
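# A plausible driver for this module: feed every RawAssertion through the
# importer above. This sketch assumes queryset_foreach(queryset, callback)
# iterates the queryset in batches; it is not the repo's verified entry
# point.
if __name__ == '__main__':
    queryset_foreach(RawAssertion.objects.all(), put_raw_assertion_in_graph)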
import math
import random

import simplenlp
from metanl import english
from luminoso3.background_space import get_commonsense_assoc
from colorizer.color_data import make_lab_color_data, lab_to_rgb, rgb_to_hsv
from colorizer.colorvote import weighted_elect_samples

ENGLISH = simplenlp.get('en')
ASSOC = get_commonsense_assoc('en', 100)
COLORDATA = {}
origdata = make_lab_color_data()


def importance_factor(colorname):
    # Rarer color names get a larger factor, so more of their samples
    # survive the subsampling below.
    imp = 10000 / math.sqrt(english.word_frequency(colorname.split()[0],
                                                   1000000))
    return int(imp)

# Subsample each color's Lab values in proportion to its importance.
for key, values in origdata.items():
    nsamples = min(len(values),
                   int(math.ceil(importance_factor(key)
                                 * math.sqrt(len(values)))))
    COLORDATA[key] = random.sample(values, nsamples)


def output_colors(labcolors):
    return [lab_to_rgb(c) for c in sorted(labcolors)]


class IncrementalColorizer(object):
    def __init__(self, ncolors):
        self.ncolors = ncolors
        self.colors = [(128, 128, 128)] * ncolors
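# A rough worked example of the subsampling math above, with made-up
# frequencies (real counts come from metanl and will differ):
#   freq('red') = 40000 per million -> imp = int(10000 / sqrt(40000)) = 50
#   with 10000 Lab samples: min(10000, ceil(50 * sqrt(10000))) = 5000 kept
#   freq(rare name) = 100 -> imp = 1000, so all of its samples are kept
# Common color names get thinned out; rare ones keep everything they have.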
import simplenlp


def test_nai():
    # "No, I didn't understand." should normalize to dictionary forms
    # plus the negation marker ない.
    ja = simplenlp.get('ja')
    test_sentence = u'いいえ、分かりませんでした。'
    assert ja.normalize(test_sentence) == u'いいえ 分かる ない'
def test_normalize():
    # "This is a test, but it's okay." should reduce to its content words.
    ja = simplenlp.get('ja')
    test_sentence = u'これはテストですが、大丈夫です。'
    assert ja.normalize_list(test_sentence) == [u'テスト', u'大丈夫']
    assert ja.normalize(test_sentence) == u'テスト 大丈夫'
def test_chinese():
    zh = simplenlp.get('zh-Hant')
    railway = u"迪士尼线"  # "Disney line"; should pass through unchanged
    assert zh.normalize(railway) == railway