def cyc_to_conceptnet_uri(labels, unlabels, uri):
    """
    Convert a Cyc URI to a ConceptNet URI, with the following rules:

    - Use the RDF label as the text. (Alternate labels appear to provide
      synonyms, but these are generally automatically generated and aren't
      particularly accurate.)
    - The part of speech is always 'n'. Cyc describes its concepts in a
      noun-like way. At worst, they're gerunds -- instead of "to eat", Cyc
      would define an event of "Eating".
    - If two different Cyc URIs have the same text, we will attempt to
      disambiguate them using the last component of the Cyc URI.
    - Remove the camel-casing from the Cyc URI component. If the phrase we
      get is the same as the natural-language label, disregard it as an
      uninformative disambiguation. Otherwise, that is the disambiguation
      text.

    A possible objection: Our disambiguation doesn't distinguish Cyc URIs
    that differ in capitalization, or differ by using underscores instead
    of camel-case. However, I've noticed that such URIs are usually
    *unintentional* duplicates that are okay to merge. If they were really
    unrelated concepts that needed to be distinguished, someone would have
    given them different names.

    Even so, we end up with some unnecessary word senses, such as different
    senses for "mens clothing", "men's clothing", and "men s clothing".
    """
    label = filter_stopwords(labels[uri])
    # More than one Cyc URI maps to this label, so try to tell them apart
    # using the URI's final path component.
    is_ambiguous = len(unlabels[label]) >= 2
    if is_ambiguous:
        disambig = filter_stopwords(un_camel_case(resource_name(uri)))
        # Only use the disambiguation if it adds information beyond the label.
        if simple_tokenize(disambig) != simple_tokenize(label):
            return standardized_concept_uri('en', label, 'n', 'opencyc', disambig)
    return standardized_concept_uri('en', label, 'n')
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    # (Fix: use `with` so the file handle is closed instead of leaked.)
    with open(input_file, encoding='utf-8') as stream:
        for subj, pred, obj, _graph in parse_nquads(stream):
            if pred['url'] == RDF_LABEL:
                labels[subj['url']] = obj['text']
                unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    with open(input_file, encoding='utf-8') as stream:
        for subj, pred, obj, _graph in parse_nquads(stream):
            rel_name = resource_name(pred['url'])
            web_subj = subj.get('url')
            web_obj = obj.get('url')
            if (
                rel_name == 'subClassOf'
                and web_obj is not None
                and web_subj in labels
                and web_obj in labels
            ):
                subj_label = labels[web_subj]
                obj_label = labels[web_obj]
                # Skip labels that look Cyc-internal rather than natural language.
                if '_' in subj_label or '_' in obj_label:
                    continue
                if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                    continue
                subj_words = set(simple_tokenize(subj_label))
                obj_words = set(simple_tokenize(obj_label))
                if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                    continue
                # Very long labels tend to be jargon-y definitions, not terms.
                if len(subj_words) > 4 or len(obj_words) > 4:
                    continue

                subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
                obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
                out.write(
                    opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label)
                )
                if (subj_uri, web_subj) not in seen_external_urls:
                    out.write(external_url_edge(subj_uri, web_subj))
                    seen_external_urls.add((subj_uri, web_subj))
                if (obj_uri, web_obj) not in seen_external_urls:
                    out.write(external_url_edge(obj_uri, web_obj))
                    seen_external_urls.add((obj_uri, web_obj))
            elif (
                rel_name == 'sameAs'
                and web_subj in labels
                # Fix: guard against a literal object (web_obj is None), which
                # would previously raise AttributeError on .startswith().
                and web_obj is not None
                and web_obj.startswith('http://umbel.org/')
            ):
                subj_label = labels[web_subj]
                subj_uri = standardized_concept_uri('en', subj_label)
                if (subj_uri, web_obj) not in seen_external_urls:
                    out.write(external_url_edge(subj_uri, web_obj))
                    seen_external_urls.add((subj_uri, web_obj))

    out.close()
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    # (Fix: use `with` so the file handle is closed instead of leaked.)
    with open(input_file, encoding='utf-8') as stream:
        for subj, pred, obj, _graph in parse_nquads(stream):
            if pred['url'] == RDF_LABEL:
                labels[subj['url']] = obj['text']
                unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    with open(input_file, encoding='utf-8') as stream:
        for subj, pred, obj, _graph in parse_nquads(stream):
            rel_name = resource_name(pred['url'])
            web_subj = subj.get('url')
            web_obj = obj.get('url')
            if (
                rel_name == 'subClassOf'
                and web_obj is not None
                and web_subj in labels
                and web_obj in labels
            ):
                subj_label = labels[web_subj]
                obj_label = labels[web_obj]
                # Skip labels that look Cyc-internal rather than natural language.
                if '_' in subj_label or '_' in obj_label:
                    continue
                if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                    continue
                subj_words = set(simple_tokenize(subj_label))
                obj_words = set(simple_tokenize(obj_label))
                if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                    continue
                # Very long labels tend to be jargon-y definitions, not terms.
                if len(subj_words) > 4 or len(obj_words) > 4:
                    continue

                subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
                obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
                out.write(
                    opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label)
                )
                if (subj_uri, web_subj) not in seen_external_urls:
                    out.write(external_url_edge(subj_uri, web_subj))
                    seen_external_urls.add((subj_uri, web_subj))
                if (obj_uri, web_obj) not in seen_external_urls:
                    out.write(external_url_edge(obj_uri, web_obj))
                    seen_external_urls.add((obj_uri, web_obj))
            elif (
                rel_name == 'sameAs'
                and web_subj in labels
                # Fix: guard against a literal object (web_obj is None), which
                # would previously raise AttributeError on .startswith().
                and web_obj is not None
                and web_obj.startswith('http://umbel.org/')
            ):
                subj_label = labels[web_subj]
                subj_uri = standardized_concept_uri('en', subj_label)
                if (subj_uri, web_obj) not in seen_external_urls:
                    out.write(external_url_edge(subj_uri, web_obj))
                    seen_external_urls.add((subj_uri, web_obj))

    out.close()
def standardize_as_list(text, token_filter=None):
    """
    Get a list of tokens or stems that appear in the text.

    `token_filter` is an optional function to apply to the list of tokens,
    performing language-specific lemmatization and stopword removal. In
    practice, the only such filter is for English.

    >>> standardize_as_list('the dog', token_filter=english_filter)
    ['dog']
    >>> standardize_as_list('big dogs', token_filter=english_filter)
    ['big', 'dog']
    >>> standardize_as_list('big dogs')
    ['big', 'dogs']
    >>> standardize_as_list('to go', token_filter=english_filter)
    ['go']
    >>> standardize_as_list('the', token_filter=english_filter)
    ['the']
    >>> standardize_as_list('to', token_filter=english_filter)
    ['to']
    """
    text = fix_text(text)
    # `list(...)` replaces a redundant identity comprehension; it still
    # guarantees a list is returned even if simple_tokenize yields lazily.
    tokens = list(simple_tokenize(text))
    if token_filter is not None:
        tokens = token_filter(tokens)
    return tokens
def filter_stopwords(text):
    """
    Drop tokens that appear in MORE_STOPWORDS, rejoining the remainder with
    spaces. If every token is a stopword, fall back to the original text so
    we never return an empty string.
    """
    kept = [tok for tok in simple_tokenize(text) if tok not in MORE_STOPWORDS]
    filtered = ' '.join(kept)
    return filtered if filtered else text
def standardize_text(text, token_filter=None):
    """
    Get a string made from the tokens in the text, joined by
    underscores. The tokens may have a language-specific `token_filter`
    applied to them. See `standardize_as_list()`.

    >>> standardize_text(' cat')
    'cat'

    >>> standardize_text('a big dog', token_filter=english_filter)
    'big_dog'

    >>> standardize_text('Italian supercat')
    'italian_supercat'

    >>> standardize_text('a big dog')
    'a_big_dog'

    >>> standardize_text('a big dog', token_filter=english_filter)
    'big_dog'

    >>> standardize_text('to go', token_filter=english_filter)
    'go'

    >>> standardize_text('Test?!')
    'test'

    >>> standardize_text('TEST.')
    'test'

    >>> standardize_text('test/test')
    'test_test'

    >>> standardize_text(' u\N{COMBINING DIAERESIS}ber\\n')
    'über'

    >>> standardize_text('embedded' + chr(9) + 'tab')
    'embedded_tab'

    >>> standardize_text('_')
    ''

    >>> standardize_text(',')
    ''
    """
    # Underscores act as word separators here, so turn them back into spaces
    # before tokenizing.
    despaced = text.replace('_', ' ')
    tokens = simple_tokenize(despaced)
    if token_filter is None:
        return '_'.join(tokens)
    return '_'.join(token_filter(tokens))
def filter_stopwords(text):
    """
    Remove the tokens listed in MORE_STOPWORDS from the text, joining what
    remains with single spaces. When stopword removal would leave nothing,
    return the text unchanged instead of an empty string.
    """
    content = " ".join(
        token for token in simple_tokenize(text) if token not in MORE_STOPWORDS
    )
    if content:
        return content
    return text