Example #1
def get_ir_variable(text):
    global global_exp_dict
    ir_variables = {}
    time_query = 0.0
    testToolI = IndexToolManager(indexName=str(authorprof_db_name),
                                 top_k=global_exp_dict['ir_top_k'])
    initial = None
    final = None
    if (global_exp_dict['tool'] == 'arango'):
        initial = time.time()
        ir_variables = testToolI.arango_get_IR_variables(
            text,
            'male',
            ignore_first_result=global_exp_dict['ignore_first_result'])
        final = time.time()
    elif (global_exp_dict['tool'] == 'elastic'):
        initial = time.time()
        ir_variables = testToolI.elastic_get_IR_variables(
            text,
            'male',
            ignore_first_result=global_exp_dict['ignore_first_result'])
        final = time.time()
    elif (global_exp_dict['tool'] == 'zettair'):
        initial = time.time()
        ir_variables = testToolI.zettair_get_IR_variables(
            text,
            'male',
            ignore_first_result=global_exp_dict['ignore_first_result'])
        final = time.time()
    # print(ir_variables)
    # Guard: if the configured tool is unknown, no query was timed.
    time_query = float(final - initial) if initial is not None else 0.0
    return ir_variables, time_query
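A minimal usage sketch for this helper. It assumes the module globals it reads (global_exp_dict, authorprof_db_name) and the IndexToolManager backends are already configured; the values below are illustrative, not the project's defaults.

# Hypothetical experiment configuration; real values come from the experiment setup.
authorprof_db_name = 'authorprof'
global_exp_dict = {
    'tool': 'elastic',            # one of 'arango', 'elastic', 'zettair'
    'ir_top_k': 10,               # how many results each IR query keeps
    'ignore_first_result': True,  # e.g. skip the query document itself
}

ir_variables, time_query = get_ir_variable('some tweet text')
print(f'{len(ir_variables)} IR variables, query took {time_query:.4f}s')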
Example #2
def measure_TIME_INDEX_old():

    mylogger.info('START OF TIME_INDEX MEASUREMENTS')
    mylogger.info(str(datetime.datetime.now()))

    initial = time.time()

    mylogger.info('CLEANING DATABASES')
    testTool = IndexToolManager(indexName=authorprof_db_name)
    testTool.clean_default()
    final = time.time()
    mylogger.info(f'CLEANING FINISHED: {final - initial}')
    mylogger.info('')
    mylogger.info('DB_AUTHORPROF')
    mylogger.info('')
    index_DB_AUTHORPROF_TOOL_ARANGO()
    mylogger.info('')
    index_bulk_DB_AUTHORPROF_TOOL_ARANGO()
    mylogger.info('')
    index_DB_AUTHORPROF_TOOL_ELASTIC()
    mylogger.info('')
    index_bulk_DB_AUTHORPROF_TOOL_ELASTIC()
    mylogger.info('')
    index_bulk_DB_AUTHORPROF_TOOL_ZETTAIR()
    mylogger.info('')

    mylogger.info('')
    mylogger.info('DB_BOTGENDER')
    mylogger.info('')
    index_DB_BOTGENDER_TOOL_ARANGO()
    mylogger.info('')
    index_bulk_DB_BOTGENDER_TOOL_ARANGO()
    mylogger.info('')
    index_DB_BOTGENDER_TOOL_ELASTIC()
    mylogger.info('')
    index_bulk_DB_BOTGENDER_TOOL_ELASTIC()
    mylogger.info('')
    index_bulk_DB_BOTGENDER_TOOL_ZETTAIR()
    mylogger.info('')

    mylogger.info('')
    mylogger.info('DB_HYPERPARTISAN')
    mylogger.info('')
    index_DB_HYPERPARTISAN_TOOL_ARANGO()
    mylogger.info('')
    index_bulk_DB_HYPERPARTISAN_TOOL_ARANGO()
    mylogger.info('')
    index_DB_HYPERPARTISAN_TOOL_ELASTIC()
    mylogger.info('')
    index_bulk_DB_HYPERPARTISAN_TOOL_ELASTIC()
    mylogger.info('')
    index_bulk_DB_HYPERPARTISAN_TOOL_ZETTAIR()
    mylogger.info('')

    mylogger.info(str(datetime.datetime.now()))
    mylogger.info('END OF TIME_INDEX MEASUREMENTS')
Example #3
def index_bulk_DB_HYPERPARTISAN_TOOL_ELASTIC():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(hyperpartisan_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_HYPERPARTISAN(
        hyperpartisan_articles_xml, hyperpartisan_ground_truth_xml)
    end = time.time()
    mylogger.info(f'get_documents_DB_HYPERPARTISAN {end - start}')
    mylogger.info(f'TOTAL documents DB_HYPERPARTISAN {len(bulk)}')

    start = time.time()
    bulkBody = testTool.bulkInsertGeneratorElastic(bulk)
    end = time.time()
    mylogger.info(f'bulkInsertGeneratorElastic {end - start}')

    start = time.time()
    testTool.bulkElastic(bulkBody)
    end = time.time()
    mylogger.info(f'bulkElastic {end - start}')

    start = time.time()
    testTool.refreshElastic()
    end = time.time()
    mylogger.info(f'refreshElastic {end - start}')

    final = time.time()

    mylogger.info(
        f'index_bulk_DB_HYPERPARTISAN_TOOL_ELASTIC: {final - initial}')
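The start/end timing-and-logging pattern repeats in every step above. A small context-manager sketch (not part of the original module) that could factor it out:

import time
from contextlib import contextmanager

@contextmanager
def timed(label, logger):
    # Log the wall-clock duration of the wrapped block, mirroring the pattern above.
    start = time.time()
    yield
    end = time.time()
    logger.info(f'{label} {end - start}')

# Each step would then read, for example:
#     with timed('bulkElastic', mylogger):
#         testTool.bulkElastic(bulkBody)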
Example #4
def index_bulk_DB_AUTHORPROF_TOOL_ELASTIC():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(authorprof_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_AUTHORPROF(authorprof_xml_folder,
                                                authorprof_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_AUTHORPROF {end - start}')
    mylogger.info(f'TOTAL documents DB_AUTHORPROF {len(bulk)}')

    start = time.time()
    bulkBody = testTool.bulkInsertGeneratorElastic(bulk)
    end = time.time()
    mylogger.info(f'bulkInsertGeneratorElastic {end - start}')

    start = time.time()
    testTool.bulkElastic(bulkBody)
    end = time.time()
    mylogger.info(f'bulkElastic {end - start}')

    start = time.time()
    testTool.refreshElastic()
    end = time.time()
    mylogger.info(f'refreshElastic {end - start}')

    final = time.time()

    mylogger.info(f'index_bulk_DB_AUTHORPROF_TOOL_ELASTIC: {final - initial}')
Example #5
def index_bulk_DB_HYPERPARTISAN_TOOL_ZETTAIR():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(hyperpartisan_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_HYPERPARTISAN(
        hyperpartisan_articles_xml, hyperpartisan_ground_truth_xml)
    end = time.time()
    mylogger.info(f'get_documents_DB_HYPERPARTISAN {end - start}')
    mylogger.info(f'TOTAL documents DB_HYPERPARTISAN {len(bulk)}')

    start = time.time()
    testTool.saveToTrecFileZettair(bulk)
    end = time.time()
    mylogger.info(f'saveToTrecFileZettair {end - start}')

    start = time.time()
    testTool.zettair_index()
    end = time.time()
    mylogger.info(f'zettair_index {end - start}')

    final = time.time()

    mylogger.info(
        f'index_bulk_DB_HYPERPARTISAN_TOOL_ZETTAIR: {final - initial}')
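Zettair indexes documents from TREC-formatted files, so saveToTrecFileZettair presumably writes the bulk list in that envelope. A rough sketch of such a writer; the field names ('id', 'text') and the output path are assumptions, not the module's actual layout:

def save_to_trec_file(documents, path='zettair_bulk.txt'):
    # Wrap each document in the classic TREC <DOC>/<DOCNO> envelope.
    with open(path, 'w', encoding='utf-8') as f:
        for doc in documents:
            f.write('<DOC>\n')
            f.write(f'<DOCNO>{doc["id"]}</DOCNO>\n')
            f.write(f'{doc["text"]}\n')
            f.write('</DOC>\n')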
Example #6
def index_DB_HYPERPARTISAN_TOOL_ELASTIC():
    initial = time.time()

    testTool = IndexToolManager(indexName=hyperpartisan_db_name)

    start = time.time()
    bulk = testTool.get_documents_DB_HYPERPARTISAN(
        hyperpartisan_articles_xml, hyperpartisan_ground_truth_xml)
    end = time.time()
    mylogger.info(f'get_documents_DB_HYPERPARTISAN {end - start}')
    mylogger.info(f'TOTAL documents DB_HYPERPARTISAN {len(bulk)}')

    start = time.time()
    for doc in bulk:
        testTool.insertElastic(doc.pop('id'), doc)
    end = time.time()
    mylogger.info(f'for-loop insertElastic {end - start}')

    start = time.time()
    testTool.refreshElastic()
    end = time.time()
    mylogger.info(f'refreshElastic {end - start}')

    final = time.time()

    mylogger.info(f'index_DB_HYPERPARTISAN_TOOL_ELASTIC: {final - initial}')
Example #7
def index_bulk_DB_BOTGENDER_TOOL_ZETTAIR():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(botgender_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_BOTGENDER(botgender_xml_folder,
                                               botgender_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_BOTGENDER {end - start}')
    mylogger.info(f'TOTAL documents DB_BOTGENDER {len(bulk)}')

    start = time.time()
    testTool.saveToTrecFileZettair(bulk)
    end = time.time()
    mylogger.info(f'saveToTrecFileZettair {end - start}')

    start = time.time()
    testTool.zettair_index()
    end = time.time()
    mylogger.info(f'zettair_index {end - start}')

    final = time.time()

    mylogger.info(f'index_bulk_DB_BOTGENDER_TOOL_ZETTAIR: {final - initial}')
Example #8
def index_DB_BOTGENDER_TOOL_ELASTIC():
    initial = time.time()

    testTool = IndexToolManager(indexName=botgender_db_name)

    start = time.time()
    bulk = testTool.get_documents_DB_BOTGENDER(botgender_xml_folder,
                                               botgender_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_BOTGENDER {end - start}')
    mylogger.info(f'TOTAL documents DB_BOTGENDER {len(bulk)}')

    start = time.time()
    for doc in bulk:
        testTool.insertElastic(doc.pop('id'), doc)
    end = time.time()
    mylogger.info(f'for-loop insertElastic {end - start}')

    start = time.time()
    testTool.refreshElastic()
    end = time.time()
    mylogger.info(f'refreshElastic {end - start}')

    final = time.time()

    mylogger.info(f'index_DB_BOTGENDER_TOOL_ELASTIC: {final - initial}')
Example #9
def index_DB_AUTHORPROF_TOOL_ELASTIC():
    initial = time.time()

    testTool = IndexToolManager(indexName=authorprof_db_name)

    start = time.time()
    bulk = testTool.get_documents_DB_AUTHORPROF(authorprof_xml_folder,
                                                authorprof_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_AUTHORPROF {end - start}')
    mylogger.info(f'TOTAL documents DB_AUTHORPROF {len(bulk)}')

    start = time.time()
    for doc in bulk:
        testTool.insertElastic(doc.pop('id'), doc)
    end = time.time()
    mylogger.info(f'for-loop insertElastic {end - start}')

    start = time.time()
    testTool.refreshElastic()
    end = time.time()
    mylogger.info(f'refreshElastic {end - start}')

    final = time.time()

    mylogger.info(f'index_DB_AUTHORPROF_TOOL_ELASTIC: {final - initial}')
Example #10
def measure_TIME_INDEX(normal=False, clean=False):
    mylogger.info('START OF TIME_INDEX MEASUREMENTS')
    exp_id = str(datetime.datetime.now())
    mylogger.info(exp_id)

    initial = time.time()

    if (clean):
        mylogger.info('CLEANING DATABASES')
        testTool = IndexToolManager(indexName=authorprof_db_name)
        testTool.clean_default()
        final = time.time()
        mylogger.info(f'CLEANING FINISHED: {final - initial}')

    tools = ['arango', 'elastic', 'zettair']
    # tools = ['zettair']
    # Full database list; the active assignment below overrides it and skips 'botgender'.
    # dbs = [
    #     'authorprof', 'botgender', 'hyperpartisan', 'hyperpartisan_split_42'
    # ]
    dbs = ['authorprof', 'hyperpartisan', 'hyperpartisan_split_42']
    # dbs = ['authorprof']

    for db in dbs:
        mylogger.info('')
        mylogger.info('DB_' + db)
        for tool in tools:
            # Zettair has no per-document insert path, so 'normal' indexing skips it.
            if (normal and tool != 'zettair'):
                index(idx_type='normal',
                      db=db,
                      tool=tool,
                      db_name=db,
                      exp_id=exp_id)
            index(idx_type='bulk',
                  db=db,
                  tool=tool,
                  db_name=str(db + '_bulk'),
                  exp_id=exp_id)

    mylogger.info(str(datetime.datetime.now()))
    mylogger.info('END OF TIME_INDEX MEASUREMENTS')
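Typical invocations, assuming the module-level logger and database fixtures are in place:

# Time only the bulk indexing paths against existing databases:
measure_TIME_INDEX()

# Wipe the databases first and also time per-document ('normal') inserts:
measure_TIME_INDEX(normal=True, clean=True)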
Example #11
def index_DB_AUTHORPROF_TOOL_ARANGO():
    initial = time.time()

    testTool = IndexToolManager(indexName=authorprof_db_name)

    start = time.time()
    bulk = testTool.get_documents_DB_AUTHORPROF(authorprof_xml_folder,
                                                authorprof_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_AUTHORPROF {end - start}')
    mylogger.info(f'TOTAL documents DB_AUTHORPROF {len(bulk)}')

    start = time.time()
    documentList = testTool.bulkListGeneratorArango(bulk)
    end = time.time()
    mylogger.info(f'bulkListGeneratorArango {end - start}')

    start = time.time()
    for doc in documentList:
        testTool.insertDocumentArango(doc)
    end = time.time()
    mylogger.info(f'for-loop insertDocumentArango {end - start}')

    final = time.time()

    mylogger.info(f'index_DB_AUTHORPROF_TOOL_ARANGO: {final - initial}')
Example #12
def index_bulk_DB_BOTGENDER_TOOL_ARANGO():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(botgender_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_BOTGENDER(botgender_xml_folder,
                                               botgender_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_BOTGENDER {end - start}')
    mylogger.info(f'TOTAL documents DB_BOTGENDER {len(bulk)}')

    start = time.time()
    documentList = testTool.bulkListGeneratorArango(bulk)
    end = time.time()
    mylogger.info(f'bulkListGeneratorArango {end - start}')

    start = time.time()
    testTool.bulkImportArango(documentList)
    end = time.time()
    mylogger.info(f'bulkImportArango {end - start}')

    final = time.time()

    mylogger.info(f'index_bulk_DB_BOTGENDER_TOOL_ARANGO: {final - initial}')
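bulkImportArango most likely wraps python-arango's bulk import. A sketch of that underlying call, with illustrative connection details and documents (the real ones live inside IndexToolManager):

from arango import ArangoClient

client = ArangoClient(hosts='http://localhost:8529')       # assumed host
db = client.db('_system', username='root', password='pw')  # assumed credentials
collection = db.collection('botgender_bulk')

documentList = [{'_key': '1', 'text': 'hello'}, {'_key': '2', 'text': 'world'}]
# One request inserts the whole list, which is why this bulk path is faster
# than the per-document for-loop in the 'normal' variants.
result = collection.import_bulk(documentList)
print(result['created'], 'documents imported')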
Example #13
def index_bulk_DB_HYPERPARTISAN_TOOL_ARANGO():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(hyperpartisan_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_HYPERPARTISAN(
        hyperpartisan_articles_xml, hyperpartisan_ground_truth_xml)
    end = time.time()
    mylogger.info(f'get_documents_DB_HYPERPARTISAN {end - start}')
    mylogger.info(f'TOTAL documents DB_HYPERPARTISAN {len(bulk)}')

    start = time.time()
    documentList = testTool.bulkListGeneratorArango(bulk)
    end = time.time()
    mylogger.info(f'bulkListGeneratorArango {end - start}')

    start = time.time()
    testTool.bulkImportArango(documentList)
    end = time.time()
    mylogger.info(f'bulkImportArango {end - start}')

    final = time.time()

    mylogger.info(
        f'index_bulk_DB_HYPERPARTISAN_TOOL_ARANGO: {final - initial}')
Example #14
def index_bulk_DB_BOTGENDER_TOOL_ELASTIC():
    initial = time.time()

    testTool = IndexToolManager(indexName=str(botgender_db_name + '_bulk'))

    start = time.time()
    bulk = testTool.get_documents_DB_BOTGENDER(botgender_xml_folder,
                                               botgender_truth_txt)
    end = time.time()
    mylogger.info(f'get_documents_DB_BOTGENDER {end - start}')
    mylogger.info(f'TOTAL documents DB_BOTGENDER {len(bulk)}')

    # start = time.time()
    # bulkBody = testTool.bulkInsertGeneratorElastic(bulk)
    # end = time.time()
    # mylogger.info(f'bulkInsertGeneratorElastic {end - start}')

    start = time.time()
    bulkBody = testTool.bulkHelperInsertGeneratorElastic(bulk)
    end = time.time()
    mylogger.info(f'bulkHelperInsertGeneratorElastic {end - start}')

    # start = time.time()
    # testTool.bulkElastic(bulkBody)
    # end = time.time()
    # mylogger.info(f'bulkElastic {end - start}')

    start = time.time()
    testTool.bulkHelperElastic(bulkBody)
    end = time.time()
    mylogger.info(f'bulkHelperElastic {end - start}')

    start = time.time()
    testTool.refreshElastic()
    end = time.time()
    mylogger.info(f'refreshElastic {end - start}')

    final = time.time()

    mylogger.info(f'index_bulk_DB_BOTGENDER_TOOL_ELASTIC: {final - initial}')
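The bulkHelper* variants, which replaced the commented-out bulkElastic path here, presumably delegate to elasticsearch-py's bulk helper. A sketch of that underlying API with illustrative index and document fields:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()  # assumed default localhost client (elasticsearch-py 7.x style)

def make_actions(documents, index_name):
    # Each action names the target index, the document id, and the body.
    for doc in documents:
        yield {
            '_index': index_name,
            '_id': doc.pop('id'),
            '_source': doc,
        }

docs = [{'id': '1', 'text': 'hello'}, {'id': '2', 'text': 'world'}]
success, errors = bulk(es, make_actions(docs, 'botgender_bulk'))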
Example #15
def load_pan_data(xmls_directory,
                  truth_path,
                  write_to_txt_files=False,
                  txts_destination_directory=None,
                  exp_dict=None,
                  exec_type='training'):
    """Load PAN data

    This function loads the PAN dataset and its ground truth, parses the XML files, and returns
    the merged tweets of each author, the truths, the Author IDs, and the original tweet lengths.
    Optionally, it can also write the tweets to TXT files.

    Args:
        xmls_directory: The directory where the XML files of the dataset reside.
        truth_path: The path of the truth file.
        write_to_txt_files: (boolean) If True, the XML files will also be written as TXT files after being parsed.
        txts_destination_directory: The TXT files will be written to this directory.

    Returns:
        merged_tweets_of_authors: List. Each item is all of the tweets of an author, merged into one string.
            Refer to the list of replacements in the remarks.
        truths: List of truths for authors.
        author_ids: List of Author IDs.
        original_tweet_lengths: List of original tweet lengths.

    Raises:
        RuntimeError: If a non-XML file exists inside *xmls_directory*.

    Remarks:
        - Since *xml_filenames* is sorted in ascending order, all the returned lists will also be in the same order
        (sorted in ascending order of the Author IDs).
        - List of replacements:
            Line feed		<LineFeed>
            End of Tweet	<EndOfTweet>
    """
    ''' 
    *os.listdir* returns a list containing the name of all files and folders in the given directory.
    Normally, the list is created in ascending order. However, the Python documentation states,
    “the list is in arbitrary order”.
    To ensure consistency and avoid errors in syncing the order of the items among
    different lists (e.g., *author_ids*, *truths*), we sort the list by calling *sorted*.
    *sorted()* returns a new sorted list (in ascending lexicographical order) of all the items in an iterable.
    '''
    global global_exp_dict
    global testTool
    global_exp_dict = (exp_dict or {}).copy()  # tolerate the None default
    testTool = IndexToolManager(indexName=str(authorprof_db_name),
                                top_k=global_exp_dict['ir_top_k'])
    xml_filenames = sorted(os.listdir(xmls_directory))

    if exec_type == 'testing':
        global_exp_dict['ignore_first_result'] = False
    # Store the Author IDs in a list
    # The Author IDs list will have the same order as the XML filenames list.
    author_ids = []  # Create an empty list
    for xml_filename in xml_filenames:
        author_ids.append(xml_filename[:-4])

    # Skip loading truth if path input is None. Else, load the truth from the file.
    if truth_path is None:
        logger.info("*truth_path* is None => Skipped loading the truth")
        truths = None
        # This scenario will happen when loading the test dataset for **TIRA** evaluation, where the truth of the test
        # set is not provided.
    else:
        truths = load_truth(truth_path, author_ids)

    if write_to_txt_files:
        logger.info("The parsed XMLs will also be written to TXT files.")
        # Create the directory if it does not exist.
        os.makedirs(txts_destination_directory, exist_ok=True)

    # Initialize the lists.
    # The lists will have the same order as the XML filenames list (refer to: “Iterate over XML Files”)
    original_tweet_lengths = []  # Create an empty list
    # ↳ Every row will represent an author, every column will represent a tweet.
    merged_tweets_of_authors = []  # Create an empty list
    # ↳ Each cell will contain all 100 tweets of an author, merged.

    ir_vars_of_authors = []

    time_query_list = []
    time_query = 0.0

    # Iterate over XML files
    for author_index, xml_filename in enumerate(xml_filenames):
        # Make sure only XML files go through
        if not fnmatch.fnmatch(xml_filename, '*.xml'):
            logger.error(
                "Encountered a non-XML file inside the directory: %s >>> The program will now exit.",
                xml_filename)
            raise RuntimeError(
                'Encountered a non-XML file inside the directory: %s' %
                xml_filename)
            # ↳ This is printf-style String Formatting.

        # Read the XML file and parse it into a tree
        # Parser is explicitly defined to ensure UTF-8 encoding.
        tree = ElementTree.parse(
            os.path.join(xmls_directory, xml_filename),
            parser=ElementTree.XMLParser(encoding="utf-8"))
        root = tree.getroot()
        '''
        root is the root element of the parsed tree
        root[0], ..., root[m-1] are the children of root—elements one level below the root.
        root[0][0], ..., root[0][n-1] are the children of root[0].
        and so on.

        Each element has a tag, a dictionary of attributes, and sometimes some text:
            root[i][j].tag, ”.attrib, ”.text
        '''

        # Add an empty new row to the list. Each row represents an author.
        original_tweet_lengths.append([])

        # Initialize the list. Note that this list resets in every author (XML file) loop.
        tweets_of_this_author = []  # Create an empty list

        author_id = xml_filename[:-4]
        # print(int(author_id, base=16))
        # print(int('eb151ca9c0e31d615dd8c335bdbc9226', base=16))
        # if int(author_id, base=16) < int('4502f17f7a9d88f6a9594e82968740b0', base=16):
        #     continue
        ir_variables_of_this_author = []
        text_list = []
        # doc_id = 1

        logger.info(
            f'{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")} Author: {author_id}'
        )
        # Iterate over the tweets within this parsed XML file:
        # Record the tweet length, replace line feeds, and append the tweet to a list
        for child in root[0]:
            # Element.text accesses the element's text content,
            # which is saved with the following format in the XML files: <![CDATA[some text]]>
            tweet = child.text
            text = tweet
            original_tweet_lengths[author_index].append(len(tweet))

            # Replace line feed (LF = \n) with “ <LineFeed> ”
            # Note: There were no carriage return (CR = \r) characters in any of the 3,000 XML files.
            tweet = tweet.replace('\n', " <LineFeed> ")

            # Create a list of the tweets of this author, to write to a text file and merge, after the loop terminates.
            '''
            Google Python Style Guide: Avoid using the + and += operators to accumulate a string within a loop.
            Since strings are immutable, this creates unnecessary temporary objects and results in quadratic rather
            than linear running time.
            Avoid: merged_tweets_of_authors[author_index] += tweet + " <EndOfTweet> "
            Instead, append each substring to a list and ''.join the list after the loop terminates.
            '''
            tweets_of_this_author.append(tweet)
            text_list.append(text)
            # ir_variables = {}
            # if (exp_dict['add_ir_variables']):
            #     initial = None
            #     final = None
            #     if (exp_dict['tool'] == 'arango'):
            #         initial = time.time()
            #         ir_variables = testTool.arango_get_IR_variables(
            #             text, 'male')
            #         final = time.time()
            #     elif (exp_dict['tool'] == 'elastic'):
            #         initial = time.time()
            #         ir_variables = testTool.elastic_get_IR_variables(
            #             text, 'male')
            #         final = time.time()
            #     elif (exp_dict['tool'] == 'zettair'):
            #         initial = time.time()
            #         ir_variables = testTool.zettair_get_IR_variables(
            #             text, 'male')
            #         final = time.time()
            #     time_query_list.append(float(final-initial))

            # ir_variables_of_this_author.append(ir_variables)
        if (global_exp_dict['add_ir_variables']):
            ir_variables_of_this_author, tq_list = get_ir_variables(text_list)
            # ir_variables_of_this_author = [{
            #     'CLASS_0_BM25_AVG': 0,
            #     'CLASS_0_BM25_COUNT': 0,
            #     'CLASS_0_BM25_SUM': 0,
            #     'CLASS_1_BM25_AVG': 0,
            #     'CLASS_1_BM25_COUNT': 0,
            #     'CLASS_1_BM25_SUM': 0,
            # }]
            time_query_list.extend(tq_list)
            # doc_id += 1
        # Despite its name, *ir_vars_dict* ends up as a flat list of aggregated IR features.
        ir_vars_dict = []
        if (global_exp_dict['add_ir_variables']):
            ir_vars = pd.DataFrame(ir_variables_of_this_author)
            ir_vars_mean = ir_vars.mean()
            ir_vars_sum = ir_vars.sum()

            # ir_vars_dict = {}
            # ir_vars_dict = {
            #     'CLASS_0_BM25_AVG': ir_vars_mean['CLASS_0_BM25_AVG'],
            #     'CLASS_0_BM25_COUNT': ir_vars_sum['CLASS_0_BM25_COUNT'],
            #     'CLASS_0_BM25_SUM': ir_vars_sum['CLASS_0_BM25_SUM'],
            #     'CLASS_1_BM25_AVG': ir_vars_mean['CLASS_1_BM25_AVG'],
            #     'CLASS_1_BM25_COUNT': ir_vars_sum['CLASS_1_BM25_COUNT'],
            #     'CLASS_1_BM25_SUM': ir_vars_sum['CLASS_1_BM25_SUM'],
            # }
            ir_vars_dict = [
                ir_vars_mean['CLASS_0_BM25_AVG'],
                ir_vars_sum['CLASS_0_BM25_COUNT'],
                ir_vars_sum['CLASS_0_BM25_SUM'],
                ir_vars_mean['CLASS_1_BM25_AVG'],
                ir_vars_sum['CLASS_1_BM25_COUNT'],
                ir_vars_sum['CLASS_1_BM25_SUM'],
            ]
        # Write the tweets of this author to a TXT file
        # Note that in these tweets, the line feed characters are replaced with a tag.
        if write_to_txt_files:
            # Create a TXT file with the Author ID as the filename (same as the XML files) in the write mode
            with open(os.path.join(txts_destination_directory,
                                   author_ids[author_index] + ".txt"),
                      'w',
                      encoding="utf-8") as txt_output_file:
                txt_output_file.write('\n'.join(tweets_of_this_author))
                # ↳ '\n'.join adds a newline character between every two strings,
                # so there won't be any extra line feeds on the last line of the file.

        # Concatenate the tweets of this author, and append it to the main list
        merged_tweets_of_this_author = " <EndOfTweet> ".join(
            tweets_of_this_author) + " <EndOfTweet>"
        # ↳ " <EndOfTweet> ".join adds the tag between every two strings, so we need to add another tag to the end.
        merged_tweets_of_authors.append(merged_tweets_of_this_author)
        ir_vars_of_authors.append(ir_vars_dict)
        # print('\n\nir_vars_dict')
        # print(ir_vars_dict)

    logger.info("@ %.2f seconds: Finished loading the dataset",
                time.process_time())

    result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
    # np.mean of an empty list yields NaN (with a warning), so guard against it.
    time_query = float(np.mean(time_query_list)) if time_query_list else 0.0
    # testTool.log_result(result_id, {
    #     'exp_id': exp_dict['exp_id'],
    #     'variable': 'TIME_QUERY',
    #     ** testTool.get_parameters(),
    #     'db': exp_dict['db'],
    #     'tool': exp_dict['tool'],
    #     'db_name': exp_dict['db_name'],
    #     'add_ir_variables': 'true' if exp_dict['add_ir_variables'] else 'false',
    #     'solution_number': exp_dict['solution_number'],
    #     'solution_name': exp_dict['solution_name'],
    #     'train_data_folder': exp_dict['train_data_folder'],
    #     'test_data_folder': exp_dict['test_data_folder'],
    #     'execution_type': exec_type,
    #     'number_queries': str(len(time_query_list)),
    #     'value': str(time_query),
    # })
    testTool.log_result(
        result_id, {
            'variable': 'TIME_QUERY',
            **testTool.get_parameters(),
            **global_exp_dict,
            'execution_type': exec_type,
            'number_queries': str(len(time_query_list)),
            'value': str(time_query),
        })

    return merged_tweets_of_authors, truths, author_ids, original_tweet_lengths, ir_vars_of_authors
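A small illustration of the replacement scheme described in the docstring: line feeds become " <LineFeed> " and tweets are joined with " <EndOfTweet> ":

tweets = ['first tweet\nsecond line', 'another tweet']
cleaned = [t.replace('\n', ' <LineFeed> ') for t in tweets]
merged = ' <EndOfTweet> '.join(cleaned) + ' <EndOfTweet>'
# -> 'first tweet <LineFeed> second line <EndOfTweet> another tweet <EndOfTweet>'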
Example #16
add_ir_variables = exp_dict['add_ir_variables']
tool = exp_dict['tool']
hyperpartisan_db_name = exp_dict['db_name']
hyperpartisan_orig_db_name = 'hyperpartisan_bulk'
db = exp_dict['db']
db_name = exp_dict['db_name']

# exp_id = str(datetime.datetime.now())
exp_id = exp_dict['exp_id']
saved_model1 = ''
saved_model2 = ''
saved_model3 = ''
test_input = ''
print(exp_dict)
# exit()
testTool = IndexToolManager(
    indexName=str(hyperpartisan_db_name), top_k=exp_dict['ir_top_k'])

testToolOrig = IndexToolManager(
    indexName=str(hyperpartisan_orig_db_name), top_k=exp_dict['ir_top_k'])

def toEvaluationFormat(all_doc_ids, all_prediction):
    n_right_predictions = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    evaluationFormatList = []
    for i in range(len(all_doc_ids)):
        current_doc_id = all_doc_ids[i]
        current_prob = all_prediction[i][0]
        current_doc_real_class = testToolOrig.arango_get_document(str(current_doc_id))[
Example #17
def index(idx_type='normal',
          db='authorprof',
          tool='arango',
          db_name='authorprof',
          exp_id='unnamed'):
    mylogger.info('')
    mylogger.info(f'INDEX TYPE: {idx_type}')
    mylogger.info(f'DB: {db}')
    mylogger.info(f'TOOL: {tool}')
    mylogger.info(f'DB NAME: {db_name}')
    initial = time.time()

    testTool = IndexToolManager(indexName=db_name)

    start = time.time()
    append_class_to_id = False
    if (tool == 'zettair'):
        append_class_to_id = True
    bulk = testTool.get_documents(db, db_files[db]['xml_folder'],
                                  db_files[db]['truth_txt'],
                                  append_class_to_id)
    end = time.time()
    mylogger.info(f'get_documents {end - start}')
    mylogger.info(f'TOTAL documents {len(bulk)}')

    start = time.time()
    if (tool == 'arango'):
        documentList = testTool.bulkListGeneratorArango(bulk)
        end = time.time()
        mylogger.info(f'bulkListGeneratorArango {end - start}')
        if (idx_type == 'normal'):
            start = time.time()
            for doc in documentList:
                testTool.insertDocumentArango(doc)
            end = time.time()
            mylogger.info(f'for-loop insertDocumentArango {end - start}')
        if (idx_type == 'bulk'):
            start = time.time()
            testTool.bulkImportArango(documentList)
            end = time.time()
            mylogger.info(f'bulkImportArango {end - start}')

    if (tool == 'elastic'):
        if (idx_type == 'normal'):
            start = time.time()
            for doc in bulk:
                testTool.insertElastic(doc.pop('id'), doc)
            end = time.time()
            mylogger.info(f'for-loop insertElastic {end - start}')

        if (idx_type == 'bulk'):
            start = time.time()
            bulkBody = testTool.bulkHelperInsertGeneratorElastic(bulk)
            end = time.time()
            mylogger.info(f'bulkHelperInsertGeneratorElastic {end - start}')

            start = time.time()
            testTool.bulkHelperElastic(bulkBody)
            end = time.time()
            mylogger.info(f'bulkHelperElastic {end - start}')

        start = time.time()
        testTool.refreshElastic()
        end = time.time()
        mylogger.info(f'refreshElastic {end - start}')

    if (tool == 'zettair'):
        start = time.time()
        testTool.saveToTrecFileZettair(bulk)
        end = time.time()
        mylogger.info(f'saveToTrecFileZettair {end - start}')

        start = time.time()
        testTool.zettair_index()
        end = time.time()
        mylogger.info(f'zettair_index {end - start}')

    final = time.time()
    result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
    testTool.log_result(
        result_id, {
            'exp_id': exp_id,
            'variable': 'TIME_INDEX',
            'index_type': idx_type,
            'db': db,
            'tool': tool,
            'db_name': db_name,
            'value': str((final - initial)),
        })
    mylogger.info(f'index TOTAL TIME: {final - initial}')
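A typical call, mirroring how measure_TIME_INDEX drives this function:

# Bulk-index the authorprof collection with Elasticsearch, tagging the run:
index(idx_type='bulk',
      db='authorprof',
      tool='elastic',
      db_name='authorprof_bulk',
      exp_id=str(datetime.datetime.now()))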