import csv
import json

import numpy as np
from elasticsearch import Elasticsearch
from tqdm import tqdm

# `Elastic` is a local wrapper around elasticsearch-py used throughout this file;
# the import path below is an assumption.
from elastic import Elastic

ELASTIC_HOSTS = ["localhost:9200"]  # placeholder; presumably defined in a config module


def cat_type_index():
    """Index DBpedia entity types and categories, in analyzed and not-analyzed versions."""
    with open("entity_category.json", "r") as f1:
        entity_cat = json.load(f1)
    with open("entity_type.json", "r") as f2:
        entity_type = json.load(f2)

    index_name = "dbpedia_2015_10_type_cat"
    mappings = {
        "type_n": Elastic.notanalyzed_field(),
        "type_a": Elastic.analyzed_field(),
        "category_n": Elastic.notanalyzed_field(),
        "category_a": Elastic.analyzed_field(),
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)

    keys = list(set(list(entity_cat.keys()) + list(entity_type.keys())))
    docs = {}
    count = 1
    for key in keys:
        entity = "<dbpedia:" + key + ">"
        types = []  # renamed from `type`, which shadows the built-in
        for t in entity_type.get(key, []):
            if t.startswith("<http://dbpedia.org/ontology"):  # keep ontology types only
                types.append(t.split(">")[0].rsplit("/")[-1])
        cat = entity_cat.get(key, [])
        cat_a = [c.replace("_", " ") for c in cat]  # analyzed version
        # e.g., camelcase "MeanOfTransportation" => "Mean Of Transportation"
        type_a = [convert_from_camelcase(t) for t in types]
        docs[entity] = {
            "type_n": types,
            "type_a": type_a,
            "category_n": cat,
            "category_a": cat_a,
        }
        if len(docs) == 10000:  # flush in bulk every 10k docs
            print("-------", count)
            count += 1
            elastic.add_docs_bulk(docs)
            docs = {}
    elastic.add_docs_bulk(docs)  # flush the remainder
    print("Finished indexing.")
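# `convert_from_camelcase` is used above but not defined in this file. A minimal
# sketch of what it presumably does, based on the example in the comment
# ("MeanOfTransportation" => "Mean Of Transportation"); the real helper may differ:
import re


def convert_from_camelcase(s):
    """Insert a space before every non-initial uppercase letter."""
    return re.sub(r"(?<!^)(?=[A-Z])", " ", s)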
def main():
    """Build a small toy index of song lyrics for testing."""
    index_name = "toy_index"
    mappings = {
        "title": Elastic.analyzed_field(),
        "content": Elastic.analyzed_field(),
    }
    docs = {
        1: {"title": "Rap God",
            "content": "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"},
        2: {"title": "Lose Yourself",
            "content": "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"},
        3: {"title": "Love The Way You Lie",
            "content": "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"},
        4: {"title": "The Monster",
            "content": ["gonna gonna I'm friends with the monster",
                        "That's under my bed Get along with the voices inside of my head"]},
        5: {"title": "Beautiful",
            "content": "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"},
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    elastic.add_docs_bulk(docs)
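# The `Elastic` wrapper itself is not shown here. A minimal sketch of the two
# mapping helpers it is assumed to provide, inferred from how they are used in
# this file (the returned mapping bodies are assumptions, not the actual code):
class ElasticSketch:
    @staticmethod
    def analyzed_field():
        # full-text field that goes through the analyzer chain
        return {"type": "text"}

    @staticmethod
    def notanalyzed_field():
        # exact-match field, indexed as a single keyword token
        return {"type": "keyword"}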
def table_index():
    """Index tables with entity, caption, heading, page-title, and catch-all fields."""
    index_name = "table_index_frt"
    mappings = {
        "entity_n": Elastic.notanalyzed_field(),
        "entity": Elastic.analyzed_field(),
        "data": Elastic.analyzed_field(),
        "caption": Elastic.analyzed_field(),
        "headings_n": Elastic.notanalyzed_field(),
        "headings": Elastic.analyzed_field(),
        "pgTitle": Elastic.analyzed_field(),
        "catchall": Elastic.analyzed_field(),
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)

    tables = {}  # TODO: load your data as a dict of {table_id: table}; see the sketch below
    docs = {}
    for table_id, table in tables.items():
        caption = table.get("caption")
        headings = label_replace(table.get("heading"))
        pgTitle = table.get("pgTitle")
        entity = table.get("entity")
        data = table.get("data")
        catchall = " ".join([caption, json.dumps(data), pgTitle, headings])
        docs[table_id] = {
            "entity_n": entity,
            "entity": entity,
            "data": data,
            "caption": caption,
            "headings_n": headings,
            "headings": headings,
            "pgTitle": pgTitle,
            "catchall": catchall,
        }
    elastic.add_docs_bulk(docs)
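# A hypothetical example of the shape `table_index` expects in `tables`, inferred
# from the fields it reads above (values are placeholders, not real data):
example_tables = {
    "table-0001": {
        "caption": "...",             # table caption text
        "heading": "...",             # column headings (passed through label_replace)
        "pgTitle": "...",             # title of the page the table came from
        "entity": ["<dbpedia:...>"],  # entities mentioned in the table
        "data": [["..."]],            # table body as rows of cells
    }
}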
def build_wiki_table_index(table_file, index_name="table_index_wikitable_train_jan_13"):
    """Index Wikipedia tables, keeping core-column entities and all linked entities."""
    mappings = {
        "core_entity_n": Elastic.notanalyzed_field(),
        "all_entity_n": Elastic.notanalyzed_field(),
        "data": Elastic.analyzed_field(),
        "caption": Elastic.analyzed_field(),
        "secTitle": Elastic.analyzed_field(),
        "headings_n": Elastic.notanalyzed_field(),
        "headings": Elastic.analyzed_field(),
        "pgTitle": Elastic.analyzed_field(),
        "catchall": Elastic.analyzed_field(),
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)

    with open(table_file, "r") as f:
        docs = {}
        for line in tqdm(f):
            table = json.loads(line.strip())
            table_id = table.get("_id", "")
            pgTitle = table.get("pgTitle", "").lower()
            secTitle = table.get("sectionTitle", "").lower()
            caption = table.get("tableCaption", "").lower()
            headers = table.get("processed_tableHeaders", [])
            rows = table.get("tableData", [])  # default was {}, but rows are indexed as a list
            entity_columns = table.get("entityColumn", [])
            headers = [headers[j] for j in entity_columns]  # keep entity-column headers only
            entity_cells = np.array(table.get("entityCell", [[]]))

            core_entities = []  # entities in the leftmost (core) column
            entities = []       # all linked entities in the table
            for i in range(len(rows)):
                for j in entity_columns:
                    if entity_cells[i, j] == 1:
                        entity = rows[i][j]["surfaceLinks"][0]["target"]["id"]
                        if entity == "":
                            continue
                        entities.append(entity)
                        if j == 0:
                            core_entities.append(entity)

            catchall = " ".join([pgTitle, secTitle, caption, " ".join(headers)])
            docs[table_id] = {
                "all_entity_n": entities,  # was core_entities; `entities` was collected but never used
                "core_entity_n": core_entities,
                "caption": caption,
                "secTitle": secTitle,
                "headings_n": headers,
                "headings": headers,
                "pgTitle": pgTitle,
                "catchall": catchall,
            }
            if len(docs) == 10000:  # flush in bulk every 10k docs
                elastic.add_docs_bulk(docs)
                docs = {}
        elastic.add_docs_bulk(docs)  # flush the remainder
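# A hypothetical sketch of one JSON line of `table_file`, inferred from the keys
# `build_wiki_table_index` reads (values are placeholders; the real corpus format
# may carry additional fields):
example_line = {
    "_id": "table-0001",
    "pgTitle": "...",                   # page title
    "sectionTitle": "...",              # section the table appears in
    "tableCaption": "...",
    "processed_tableHeaders": ["..."],  # one header per column
    "entityColumn": [0],                # indices of columns holding entity links
    "entityCell": [[1]],                # 0/1 matrix: cell (i, j) holds an entity link
    "tableData": [[{"surfaceLinks": [{"target": {"id": "..."}}]}]],
}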
def main():
    """Index passages from a two-column CSV (id, text) into Elasticsearch."""
    index_name = "collectiontsv2pure"
    mappings = {
        "content": Elastic.analyzed_field(),
    }
    alldocs = {}
    elastic = Elastic(index_name)
    # elastic.delete_index()  # uncomment to drop the index first
    es = Elasticsearch(hosts=ELASTIC_HOSTS, timeout=30, max_retries=10,
                       retry_on_timeout=True)
    with open("F:\\treccar.csv", encoding="utf-8") as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter=",")
        i = 0
        for line in tsvreader:
            i += 1
            # skip empty passages; the 2,750,000 offset presumably resumes an
            # earlier run that was interrupted partway through the file
            if line[1] == "" or i < 2750000:
                continue
            alldocs[line[0]] = {"content": line[1]}
            if i % 50000 == 0:  # flush in bulk every 50k rows
                elastic.add_docs_bulk(alldocs)
                alldocs.clear()
                print(i)
    if es.indices.exists(index_name):
        elastic.add_docs_bulk(alldocs)
        print("index updated")
    else:
        elastic.create_index(mappings)
        elastic.add_docs_bulk(alldocs)
        print("new index created")
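# A quick sanity check after the bulk load, using the raw elasticsearch-py
# client already created above (a sketch; the helper name is ours):
def check_count(es, index_name="collectiontsv2pure"):
    """Print how many passages made it into the index."""
    print(es.count(index=index_name)["count"])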
def build_wiki_category_index(category_file, index_name="wikipedia_category"):
    """Index Wikipedia categories, keeping raw and analyzed versions."""
    mappings = {
        "category_n": Elastic.notanalyzed_field(),
        "category_a": Elastic.analyzed_field(),
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)

    docs = {}
    with open(category_file, "r") as f:
        for line in tqdm(f):
            wiki_id, cat = json.loads(line.strip())
            cat_a = [c.replace("_", " ") for c in cat]  # analyzed version
            docs[wiki_id] = {"category_n": cat, "category_a": cat_a}
            if len(docs) == 10000:  # flush in bulk every 10k docs
                elastic.add_docs_bulk(docs)
                docs = {}
        elastic.add_docs_bulk(docs)  # flush the remainder
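# Usage sketch: each line of `category_file` is assumed to be a JSON pair of
# [wiki_id, [category, ...]], matching the unpacking above, e.g.:
#   ["<wiki_id>", ["Some_Category", "Another_Category"]]
if __name__ == "__main__":
    build_wiki_category_index("wiki_categories.jsonl")  # hypothetical file name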