Example #1
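A crawler entry point. On a fresh index it seeds the queue with four seed URLs; otherwise it re-queues 100 randomly chosen cached URLs, then fans the work out to four crawler threads. url_queue, the SEED_URL_* constants, index, and crawl_queue are module-level names from the surrounding project; assuming url_queue is a queue.PriorityQueue, entries with lower numbers are crawled first.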
import threading
from random import randint

def start_crawl():
    global url_queue  # module-level queue, assumed to be a queue.PriorityQueue
    index.create_index()
    index.fetch_cached_urls()

    url_cnt = index.url_count()
    if url_cnt == 0:
        # Fresh index: enqueue the seed URLs. With a priority queue,
        # lower numbers dequeue first, so SEED_URL_4 is crawled earliest.
        url_queue.put((4, {SEED_URL_1: {'level': 0}}))
        url_queue.put((3, {SEED_URL_2: {'level': 0}}))
        url_queue.put((2, {SEED_URL_3: {'level': 0}}))
        url_queue.put((1, {SEED_URL_4: {'level': 0}}))
    else:
        # Warm start: re-queue 100 random cached URLs at top priority.
        for _ in range(100):
            rand_index = randint(0, url_cnt - 1)  # randint is inclusive at both ends
            url = index.get_url(rand_index)
            url_queue.put((-1, {url: {'level': 0}}))

    # Fan the queue out to four worker threads and wait for them all.
    threads = []
    for _ in range(4):
        t = threading.Thread(target=crawl_queue)
        threads.append(t)

    for t in threads:
        t.start()

    for t in threads:
        t.join()
Example #2
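Table creation for a mini-DBMS. Each attribute appears to be a 5-element tuple whose last element is a uniqueness flag: the function verifies the primary key is unique, registers the table with the catalog, record, and index managers, and builds a 'Uni_'-prefixed index for every other unique attribute.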
def create_table(name, attribute, PK):
    catalog.init_catalog()
    catalog.exist_table(name, True)  # raises if the table already exists
    # Locate the primary-key attribute and require its uniqueness flag.
    pidx = [x[0] for x in attribute].index(PK)
    if len(attribute[pidx]) != 5 or attribute[pidx][-1] != 1:
        raise Exception('Primary key is not a unique attribute!')
    catalog.create_table(name, attribute, PK)
    record.create_table(name)
    index.create_table(name, PK)
    # Build a 'Uni_' index for every other attribute flagged as unique.
    for x in attribute:
        if PK not in x and len(x) == 5 and x[-1] == 1:
            index.create_index(name, 'Uni_' + x[0], x[0])
    catalog.finalize()
Example #3
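Builds a Whoosh index over a folder of cleaned .txt documents, then runs a stemmed, OR-grouped search for a sample query and prints the top five hits. create_index and Query are project-local helpers; the standard-library and Whoosh imports the excerpt needs are added above the function.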
import glob
from multiprocessing import Pool

from whoosh import qparser
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser


def main():

    path_read_docs_index = "/home/jessica/Documents/data/IN_104-Projet-Informatique/clean_docs/"
    path_save_index_folder = "/home/jessica/Documents/data/IN_104-Projet-Informatique/index/"
    index_name = "index_IN104"
    num_docs_index = 50000
    number_docs_result_search = 5

    # Collect the cleaned .txt documents to index.
    lst_index_docs = glob.glob(path_read_docs_index + '*.txt')

    # create_index is a project-local helper that builds the Whoosh index
    # with a pool of 8 workers.
    pool = Pool(8)
    schema = Schema(path=TEXT(stored=True),
                    content=TEXT(analyzer=StemmingAnalyzer()))
    ix = create_index(path_save_index_folder, index_name, num_docs_index,
                      lst_index_docs, schema, pool)
    searcher = ix.searcher()
    # OrGroup ranks documents that match any of the query terms.
    parser_query = QueryParser("content", schema=schema, group=qparser.OrGroup)

    user_query = "types of cancer in the human body"
    # Query is a project-local wrapper object around the search.
    query = Query(user_query)
    docs_result = query.get_query(parser_query, searcher,
                                  number_docs_result_search)

    for doc in docs_result:
        print(doc)
Example #4
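Index creation for a mini-DBMS front end. It rejects duplicate indexes (reporting via return value or stdout depending on if_str_command), registers the index in the catalog, then scans the table's records and passes the indexed column's values to the index manager.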
def create_index(new_idex, if_str_command):
    # Refuse duplicates; report via return value or stdout as requested.
    if catalog_manager.check_index(new_idex.table_name,
                                   new_idex.attribute_name) != 0:
        if if_str_command:
            return "Index already exists!\n"
        print("Index already exists!")
        return
    catalog_manager.create_index(new_idex)
    # Scan every record and collect the values of the indexed column.
    temp = record_manager.select_record_with_Index(new_idex.table_name, 0, [])
    cnt = catalog_manager.get_attribute_cnt(new_idex.table_name,
                                            new_idex.attribute_name)
    values = [row[cnt] for row in temp]  # renamed from 'list' to avoid shadowing the builtin
    # print(new_idex.index_id, values)
    index.create_index(new_idex.index_id, values)
Example #5
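Creates a secondary index: after confirming the name is free, it registers the index in the catalog, has the record manager extract the keyed column from existing records, and hands the result to the index manager. Duplicate keys in that column surface as an exception. The commented-out init/finalize calls are preserved from the original (the trailing one was an unterminated triple-quoted string in the source).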
def create_index(tname, iname, iattr):
    # record.init()
    # catalog.init_catalog()
    # index.init_index()
    catalog.exist_index(iname, True)  # raises if the index name is taken
    catalog.create_index(tname, iname, iattr)
    # Have the record manager extract the keyed column from existing records.
    res = record.create_index(tname,
                              catalog.get_index_of_attribute(tname, iattr),
                              catalog.get_type_of_attribute(tname, iattr),
                              catalog.get_length(tname))
    try:
        index.create_index(tname, iname, res)
    except Exception:
        raise Exception(
            'Entries share the same key on the column being indexed!'
        )
    # index.finalize_index()
Example #6
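Empties a table: every index on the table is dropped and immediately recreated empty, then the record manager clears the stored rows. The if_str_command parameter is accepted but unused here.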
def delete_all(table_name, if_str_command):
    # Drop and recreate each of the table's indexes as empty.
    ind = catalog_manager.get_index(table_name)
    for i in ind:
        index.drop_index(i.index_id)
        index.create_index(i.index_id, [])
    # Remove all stored records.
    record_manager.clear_table(table_name)
Example #7
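Table creation driven by a parsed-command dictionary: it creates the table's primary index empty, then registers the table with the catalog and record managers.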
def create_table(cmd, if_str_command):  # parameter renamed from 'dict' to avoid shadowing the builtin
    # Start the table's primary index out empty.
    prim_index = cmd['pri_index']
    index.create_index(prim_index.index_id, [])
    table = cmd['new_table']
    catalog_manager.create_table(table, prim_index)
    record_manager.create_table(cmd['table_name'])
Example #8
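A Whoosh loading script: it (re)creates the index through a project-local create_index, opens it, and streams every table from a JSON file into the index via a writer, committing once at the end.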
import json

from whoosh.index import open_dir

from index import create_index, INDEX_NAME
from schema import to_fields, SCHEMA

# (Re)create the on-disk index, then open it for writing.
create_index(INDEX_NAME, SCHEMA)

filename = './data/relevant_Tables_working.json'

ix = open_dir(INDEX_NAME)  # renamed from 'index' to avoid clashing with the index module
writer = ix.writer()

with open(filename, 'r') as file:
    data = json.load(file)
    for identifier, table in data.items():
        print(identifier)  # progress output
        fields = to_fields(identifier, table)
        writer.add_document(**fields)
# A single commit at the end keeps the batch write cheap.
writer.commit()
Example #9
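A benchmarking scratch file for a graph store (zeusdb): it defines sample person nodes and a friend edge and times index creation over node and edge tables, with most calls commented out. The snippet is cut off at the top, so the opening of the first dict is reconstructed below; its name and id are inferred from the commented-out create_node calls and the edge's source field.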
    "first_name": "Arvind",
    "last_name": "ds",
    "gender": "M",
    "age": "16",
    "type": "person"
}
person_node_2 = {
    "id": "Afdg",
    "first_name": "Ara",
    "last_name": "ds",
    "gender": "M",
    "age": "16",
    "type": "person"
}
person_edges = {"source": "afds", "destination": "afdg", "type": "friend"}
'''
start_time = time()
index.create_index("node_index10", "test_type2", node_properties_2, zeusdb_temp.NODE_TABLE)
end_time = time()
print "Create Node Index total time : " + str(end_time - start_time)
'''
#start_time = time()
#index.create_index("edge_index17", "test_type2", edge_properties_2, zeusdb_temp.EDGE_TABLE)
#end_time = time()
#print "Create Edge Index total time : " + str(end_time - start_time)

#zeusdb_temp.create_node(person_node, True)
#zeusdb_temp.create_node(person_node_1, True)
#zeusdb_temp.create_edge(props, True)

#start_time = time()
Example #10
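The minimal variant: confirm the index name is free, create the index in both the index manager and the catalog, and flush the catalog.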
def create_index(tname, iname, iattr):
    catalog.init_catalog()
    catalog.exist_index(tname, iname, True)  # raises if the name is taken
    index.create_index(tname, iname, iattr)
    catalog.create_index(tname, iname, iattr)
    catalog.finalize()  # persist the catalog change
Example #11
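An Elasticsearch/Kibana bootstrap script: if the target index does not exist yet, it pulls XML data over an Oracle connection and creates the index, then configures Kibana. The Elasticsearch import is added here as an assumption; ES_URL and the *_setup/index modules are project-local. The Build Dashboards section is truncated in the source.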
from elasticsearch import Elasticsearch  # assumed import; the client is used below

# if answer in ['Y', 'y', 'YES', 'yes']:
# 	es_setup.delete_cluster()
# 	es_setup.clear_cache()


#######################################
#			Create Index
#######################################
es = Elasticsearch(ES_URL)
res = index.get_index(es)

# Only build the index when it does not already exist.
if not res:
	db = db_setup.oracle_connection()
	db_setup.xml_cursor(db, es)
	index.create_index(es)


#######################################
#			Kibana Configuration
#######################################
kibana_setup.clear_cache()
kibana_setup.connect_kibana()
kibana_setup.map_config()
kibana_setup.map_index_pattern()
kibana_setup.set_default_index_pattern()
kibana_setup.set_config()
#######################################
#			Build Dashboards
#######################################

Example #12
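A rumour-classification pipeline runner: it removes a leftover dataset directory, then runs preprocessing, index creation, and TF-IDF computation before starting the server. Note that create_classifier is imported but never invoked in this snippet.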
import shutil

from preprocess import main as preprocess
from index import main as create_index
from tfidf import main as tfidf
from classifier import main as create_classifier
from server import main as start_server

if __name__ == '__main__':
    # Remove leftover random-rumour data; the True flag ignores errors if absent.
    shutil.rmtree('resources/dataset/rumoureval-data/random-rumours', True)
    print('running preprocess')
    preprocess()
    print('creating index')
    create_index()
    print('calculating tfidfs')
    tfidf()
    print('starting server')
    start_server()