# NOTE: module paths in these imports are assumed from the ConceptNet5
# package layout.
import gzip
from collections import defaultdict

from conceptnet5.formats.msgpack_stream import read_msgpack_stream
from conceptnet5.languages import (
    ALL_LANGUAGES,
    ATOMIC_SPACE_LANGUAGES,
    COMMON_LANGUAGES,
)
from conceptnet5.uri import get_uri_language, is_absolute_url, join_uri, split_uri


def keep_concept(uri):
    """
    Decide whether to keep a concept URI: absolute URLs are always kept,
    and concept URIs are kept when their language is known and their term
    is non-empty.
    """
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
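# A minimal usage sketch for keep_concept(); the URIs below are hypothetical
# examples, and the expected results assume 'en' is in ALL_LANGUAGES.
def _demo_keep_concept():
    examples = [
        'http://example.com/page',  # absolute URL: always kept
        '/c/en/example',            # known language, non-empty term: kept
        '/c/qqq/example',           # unknown language code: dropped
    ]
    for uri in examples:
        print(uri, keep_concept(uri))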
def prepare_vocab_for_morphology(language, input, output):
    """
    Filter a file of space-separated (count, URI) lines down to the terms of
    a single language, merging counts for repeated terms, and write sorted
    (count, term) lines as input for morphological analysis.
    """
    vocab_counts = defaultdict(int)
    for line in input:
        countstr, uri = line.strip().split(' ', 1)
        if get_uri_language(uri) == language:
            term = split_uri(uri)[2]
            # Terms in 'atomic space' languages get a trailing underscore
            # marker.
            if language in ATOMIC_SPACE_LANGUAGES:
                term += '_'
            vocab_counts[term] += int(countstr)
    for term, count in sorted(vocab_counts.items()):
        print(count, term, file=output)
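# A minimal sketch of the expected input format: each line is a count, a
# space, and a concept URI. The counts and URIs here are made up, and the
# expected output assumes 'en' is not in ATOMIC_SPACE_LANGUAGES.
def _demo_prepare_vocab_for_morphology():
    import io
    import sys

    counts = io.StringIO('12 /c/en/example\n3 /c/en/example\n5 /c/fr/exemple\n')
    # Merges the two 'example' counts and skips the French entry,
    # printing "15 example".
    prepare_vocab_for_morphology('en', counts, sys.stdout)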
def export_plain_text(table, uri_file, file_base):
    """
    Export an embedding table to gzipped plain-text files, one vector per
    line as a label followed by its values:

    - `<file_base>_uris_main.txt.gz`: vectors for concept URIs in the table
    - `<file_base>_en_main.txt.gz`: English terms whose URI is in the table
    - `<file_base>_en_extra.txt.gz`: English terms looked up out of vocabulary
    """
    from ..vectors.query import VectorSpaceWrapper

    def vec_to_text_line(label, vec):
        cells = [label] + ['%4.4f' % val for val in vec]
        return ' '.join(cells)

    uri_main_file = gzip.open(file_base + '_uris_main.txt.gz', 'wt')
    english_main_file = gzip.open(file_base + '_en_main.txt.gz', 'wt')
    english_extra_file = gzip.open(file_base + '_en_extra.txt.gz', 'wt')
    wrap = VectorSpaceWrapper(frame=table)

    for line in open(uri_file, encoding='utf-8'):
        uri = line.strip()
        if uri.count('/') == 3 and get_uri_language(uri) in COMMON_LANGUAGES:
            if uri in table.index:
                vec = table.loc[uri].values
                print(vec_to_text_line(uri, vec), file=uri_main_file)
            else:
                # Out of vocabulary: only look up single English words, and
                # skip them if they come back with a zero vector.
                if not uri.startswith('/c/en') or '_' in uri:
                    continue
                vec = wrap.get_vector(uri)
                if vec.dot(vec) == 0:
                    continue
            if uri.startswith('/c/en/'):
                label = uri[6:]
                if uri in table.index:
                    print(vec_to_text_line(label, vec), file=english_main_file)
                else:
                    print(vec_to_text_line(label, vec), file=english_extra_file)

    uri_main_file.close()
    english_main_file.close()
    english_extra_file.close()
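# A hedged usage sketch for export_plain_text(), assuming the table is a
# pandas DataFrame of embeddings indexed by concept URI. The URIs, paths,
# and 4-dimensional vectors are made-up examples.
def _demo_export_plain_text():
    import numpy as np
    import pandas as pd

    frame = pd.DataFrame(
        np.random.randn(2, 4),
        index=['/c/en/example', '/c/fr/exemple'],
    )
    with open('/tmp/demo_uris.txt', 'w', encoding='utf-8') as f:
        f.write('/c/en/example\n/c/fr/exemple\n')
    # Writes /tmp/demo_embeddings_uris_main.txt.gz, ..._en_main.txt.gz, and
    # ..._en_extra.txt.gz.
    export_plain_text(frame, '/tmp/demo_uris.txt', '/tmp/demo_embeddings')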
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association between X and "good".
    - An assertion that "People don't want X" is converted to an association
      between X and "bad".

    The result can be used to predict word associations using ConceptNet by
    applying dimensionality reduction, as in the `conceptnet5.vectors`
    package (formerly the `assoc_space` package).

    The relation is mostly ignored because we have not yet found a good way
    to take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                start_uri.startswith('/c/')
                and end_uri.startswith('/c/')
                and get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            # When a URI names a particular sense of a term, emit a
            # /r/SenseOf link from the full URI to its three-piece prefix,
            # once per (URI, dataset) pair.
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset, rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
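# A hedged usage sketch for msgpack_to_assoc(). It writes a single
# hypothetical edge with MsgpackStreamWriter (assumed to live in
# conceptnet5.formats.msgpack_stream) and converts it; the file names and
# the edge's dataset are made up.
def _demo_msgpack_to_assoc():
    from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter

    writer = MsgpackStreamWriter('/tmp/demo_edges.msgpack')
    writer.write({
        'start': '/c/en/person',
        'end': '/c/en/coffee',
        'rel': '/r/Desires',
        'weight': 1.0,
        'dataset': '/d/conceptnet/4/en',
    })
    writer.close()
    # "People want coffee" becomes an association between coffee and "good":
    # /c/en/good  /c/en/coffee  1.0  /d/conceptnet/4/en  /r/Desires
    msgpack_to_assoc('/tmp/demo_edges.msgpack', '/tmp/demo_assoc.csv')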