import csv
import json
import re

import numpy as np
from elasticsearch import Elasticsearch
from tqdm import tqdm

# The Elastic wrapper class and the ELASTIC_HOSTS constant are assumed to come
# from the surrounding project (e.g., a Nordlys-style retrieval toolkit); they
# are not defined in these snippets.


def table_index():
    index_name = "table_index_frt"
    mappings = {
        "entity_n": Elastic.notanalyzed_field(),
        "entity": Elastic.analyzed_field(),
        "data": Elastic.analyzed_field(),
        "caption": Elastic.analyzed_field(),
        "headings_n": Elastic.notanalyzed_field(),
        "headings": Elastic.analyzed_field(),
        "pgTitle": Elastic.analyzed_field(),
        "catchall": Elastic.analyzed_field(),
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    tables = {}  # TODO: map your data into a dict of table_id -> table JSON (see the sample after this function)
    docs = {}
    for table_id, table in tables.items():
        # Default missing fields to empty values so the join below cannot fail on None.
        caption = table.get("caption", "")
        headings = label_replace(table.get("heading", []))
        pgTitle = table.get("pgTitle", "")
        entity = table.get("entity", [])
        data = table.get("data", [])
        catchall = " ".join([caption, json.dumps(data), pgTitle, headings])
        docs[table_id] = {
            "entity_n": entity,
            "entity": entity,
            "data": data,
            "caption": caption,
            "headings_n": headings,
            "headings": headings,
            "pgTitle": pgTitle,
            "catchall": catcallall
        }
    elastic.add_docs_bulk(docs)
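
# label_replace is used above but not defined in this snippet. A minimal
# sketch, assuming it only normalizes heading labels into a plain
# space-separated string (the project's own version may behave differently):
def label_replace(headings):
    if isinstance(headings, list):
        return " ".join(h.replace("_", " ") for h in headings)
    return str(headings).replace("_", " ")


# Illustrative shape of one entry in the `tables` dict above. The field names
# are the keys table_index() reads; the values are made-up examples.
sample_tables = {
    "table-0001": {
        "caption": "Longest rivers",
        "heading": ["River", "Length (km)"],
        "pgTitle": "List of rivers by length",
        "entity": ["<dbpedia:Nile>", "<dbpedia:Amazon_River>"],
        "data": [["Nile", "6650"], ["Amazon", "6400"]],
    }
}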
def build_wiki_table_index(table_file, index_name="table_index_wikitable_train_jan_13"):
    mappings = {
        "core_entity_n": Elastic.notanalyzed_field(),
        "all_entity_n": Elastic.notanalyzed_field(),
        "data": Elastic.analyzed_field(),
        "caption": Elastic.analyzed_field(),
        'secTitle': Elastic.analyzed_field(),
        "headings_n": Elastic.notanalyzed_field(),
        "headings": Elastic.analyzed_field(),
        "pgTitle": Elastic.analyzed_field(),
        "catchall": Elastic.analyzed_field(),
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    with open(table_file, 'r') as f:
        docs = {}
        for line in tqdm(f):
            table = json.loads(line.strip())
            table_id = table.get("_id", "")
            pgTitle = table.get("pgTitle", "").lower()
            secTitle = table.get("sectionTitle", "").lower()
            caption = table.get("tableCaption", "").lower()
            headers = table.get("processed_tableHeaders", [])
            rows = table.get("tableData", {})
            entity_columns = table.get("entityColumn", [])
            headers = [headers[j] for j in entity_columns]
            entity_cells = np.array(table.get("entityCell",[[]]))
            core_entities = []
            num_rows = len(rows)
            entities = []

            for i in range(num_rows):
                for j in entity_columns:
                    if entity_cells[i, j] == 1:  # this cell links to an entity
                        entity = rows[i][j]['surfaceLinks'][0]['target']['id']
                        if entity == "":
                            continue
                        entities.append(entity)
                        if j == 0:  # column 0 holds the table's core entities
                            core_entities.append(entity)
            catcallall = " ".join([pgTitle, secTitle, caption, " ".join(headers)])
            docs[table_id] = {
                "all_entity_n": core_entities,
                "core_entity_n": core_entities,
                "caption": caption,
                'secTitle': secTitle,
                "headings_n": headers,
                "headings": headers,
                "pgTitle": pgTitle,
                "catchall": catcallall
            }
            if len(docs) == 10000:  # flush in batches of 10k documents
                elastic.add_docs_bulk(docs)
                docs = {}
        if docs:  # flush the remainder
            elastic.add_docs_bulk(docs)
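
# A hedged usage sketch for build_wiki_table_index. The file name is
# illustrative; the expected input (inferred from the fields read above) is
# one JSON table per line, e.g.:
#   {"_id": "t1", "pgTitle": "Rivers", "sectionTitle": "Longest",
#    "tableCaption": "Longest rivers", "processed_tableHeaders": ["river"],
#    "tableData": [[{"surfaceLinks": [{"target": {"id": "Nile"}}]}]],
#    "entityColumn": [0], "entityCell": [[1]]}
# build_wiki_table_index("wikitables_train.jsonl")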
def cat_type_index():
    with open("entity_category.json", "r") as f1:
        entity_cat = json.load(f1)
    with open("entity_type.json", "r") as f2:
        entity_type = json.load(f2)
    index_name = "dbpedia_2015_10_type_cat"
    mappings = {
        "type_n": Elastic.notanalyzed_field(),
        "type_a": Elastic.analyzed_field(),
        "category_n": Elastic.notanalyzed_field(),
        "category_a": Elastic.analyzed_field()
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    keys = set(entity_cat) | set(entity_type)  # union of entities from both files
    docs = {}
    count = 1
    for key in keys:
        entity = "<dbpedia:" + key + ">"
        types = []  # avoid shadowing the built-in `type`
        for t in entity_type.get(key, []):
            # only keep DBpedia ontology types
            if t.startswith("<http://dbpedia.org/ontology"):
                types.append(t.split(">")[0].rsplit("/")[-1])

        cat = entity_cat.get(key, [])
        cat_a = []
        for c in cat:  # prepare analyzed version
            cat_a.append(c.replace("_", " "))
        type_a = []
        for t in types:
            # e.g., camelcase "MeanOfTransportation" => "Mean Of Transportation"
            type_a.append(convert_from_camelcase(t))

        doc = {
            "type_n": type,
            "type_a": type_a,
            "category_n": cat,
            "category_a": cat_a
        }
        docs[entity] = doc
        if len(docs) == 10000:
            print("-------", count)
            count += 1
            elastic.add_docs_bulk(docs)
            docs = {}
    elastic.add_docs_bulk(docs)
    print("Finish now")
def main():
    index_name = "toy_index"

    mappings = {
        "title": Elastic.analyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    docs = {
        1: {
            "title": "Rap God",
            "content": "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"
        },
        2: {
            "title": "Lose Yourself",
            "content": "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"
        },
        3: {
            "title": "Love The Way You Lie",
            "content": "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"
        },
        4: {
            "title": "The Monster",
            "content": [
                "gonna gonna I'm friends with the monster",
                "That's under my bed Get along with the voices inside of my head"
            ]
        },
        5: {
            "title": "Beautiful",
            "content": "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"
        }
    }

    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    elastic.add_docs_bulk(docs)
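
# A hedged retrieval sketch against the toy index built above. The Elastic
# wrapper's query API is not shown in these snippets, so this goes through the
# official elasticsearch-py client directly; host settings are illustrative.
def search_toy_index(query, num=3):
    es = Elasticsearch()
    body = {"query": {"match": {"content": query}}}
    res = es.search(index="toy_index", body=body, size=num)
    for hit in res["hits"]["hits"]:
        print(hit["_id"], hit["_score"], hit["_source"]["title"])

# search_toy_index("monster")  # expected to rank "The Monster" first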
def build_wiki_category_index(category_file, index_name="wikipedia_category"):
    mappings = {
        "category_n": Elastic.notanalyzed_field(),
        "category_a": Elastic.analyzed_field()
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    docs = {}
    with open(category_file, 'r') as f:
        for line in tqdm(f):
            wiki_id, cat = json.loads(line.strip())
            cat_a = []
            for c in cat:  # prepare analyzed version
                cat_a.append(c.replace("_", " "))
            doc = {"category_n": cat, "category_a": cat_a}
            docs[wiki_id] = doc
            if len(docs) == 10000:
                elastic.add_docs_bulk(docs)
                docs = {}
    elastic.add_docs_bulk(docs)
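
# Expected input format for build_wiki_category_index, inferred from the
# json.loads unpacking above: each line is a JSON array of
# [wiki_id, list_of_categories]. The identifiers below are illustrative.
example_line = '["Douglas_Adams", ["English_writers", "Humorists"]]'
wiki_id, cat = json.loads(example_line)
assert wiki_id == "Douglas_Adams" and cat == ["English_writers", "Humorists"]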
def main():
    index_name = "collectiontsv2pure"

    mappings = {
        "content": Elastic.analyzed_field(),
    }

    alldocs = {}
    elastic = Elastic(index_name)
    # elastic.delete_index()
    es = Elasticsearch(hosts=ELASTIC_HOSTS,
                       timeout=30,
                       max_retries=10,
                       retry_on_timeout=True)
    num = 0
    with open("F:\\treccar.csv", encoding='utf-8') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter=",")
        i = 0
        for line in tsvreader:
            doc = {}
            i = i + 1
            if (line[1] == '' or i < 2750000):
                continue

            doc['content'] = line[1]
            alldocs[line[0]] = doc

            if i % 50000 == 0:  # flush every 50k documents
                elastic.add_docs_bulk(alldocs)
                alldocs.clear()
                print(i)

    if es.indices.exists(index=index_name):
        elastic.add_docs_bulk(alldocs)
        print("index updated")

    else:
        elastic.create_index(mappings)
        elastic.add_docs_bulk(alldocs)
        print("new index created")