Code Example #1
def loop_topic_data(fileID=None,
                    model_number=None,
                    dir_path=parser.get('final_stage',
                                        'loop_topic_data_dir_path'),
                    dest_dir=parser.get('final_stage',
                                        'loop_topic_data_dest_dir')):
    global score_data
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir)
    # r=root, d=directories, f = files
    for r, d, f in os.walk(dir_path, topdown=True):
        if r.split(os.path.sep)[-1] == 'topic_class':
            continue
        if (model_number is not None and r.split(os.path.sep)[-1]
                == model_number) or (model_number is None):
            current_model = r.split(os.path.sep)[-1]
            output_dir_path = os.path.join(dest_dir, r.split(os.path.sep)[-1])
            if not os.path.exists(output_dir_path):
                os.mkdir(output_dir_path)
            print(r)
            if current_model == 'textData':
                continue
            for file in f:
                file_id = file.split('_')[0]
                if (fileID is not None and file.split('_')[0]
                        == str(fileID)) or (fileID is None):
                    file_path_dir = os.path.join(output_dir_path, file_id)
                    if not os.path.exists(file_path_dir):
                        os.mkdir(file_path_dir)
                    print(os.path.join(r, file))
                    score_data = sr.load_data(file[:-10] + '.txt',
                                              current_model)
                    process_topic_file(os.path.join(r, file), file_path_dir)
                    create_json_repr(file_id, file_path_dir,
                                     r.split(os.path.sep)[-1])
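
# A minimal usage sketch, assuming the directories configured under
# [final_stage] in project_paths.ini exist; the file ID 1002 and the model
# folder name '10' are hypothetical values chosen for illustration.
loop_topic_data(fileID=1002, model_number='10')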
Code Example #2
def loop_discourse_results_one_file(
    topic_number,
    fileId,
    mode='LDA',
    discourse_output_dir=parser.get(
        'LDA_process',
        'loop_discourse_results_one_file_discourse_output_dir')):
    # Maps each processed file to the list of all topics detected in it
    results_dict = dict()
    for discourse_results in os.listdir(discourse_output_dir):
        # If the current entry is a directory => continue
        if os.path.isdir(
                os.path.join(discourse_output_dir, discourse_results)):
            continue
        # If the current file is not the fileId we are looking for => continue
        if discourse_results.split('_')[0] != fileId:
            continue
        # Files marked FAILED_PARSE or FAILED_SEG cannot be processed => continue
        if 'strip_output' in discourse_results and 'FAILED_PARSE' not in discourse_results and 'FAILED_SEG' not in discourse_results:
            results_dict[discourse_results[:-17]] = list()
            text_tuples = list()
            classify_discourse_tree(discourse_results,
                                    results_dict[discourse_results[:-17]],
                                    discourse_output_dir,
                                    topic_number,
                                    text_tuples,
                                    mode=mode)
    if mode == 'LDA':
        write_stats_to_file(results_dict, topic_number + 1)
    elif mode == 'HLDA':
        write_stats_to_file_hlda(results_dict)
Code Example #3
def loop_discourse_results(
    topic_number,
    mode='LDA',
    discourse_output_dir=parser.get(
        'LDA_process',
        'loop_discourse_results_one_file_discourse_output_dir')):
    results_dict = dict()
    counter = 0
    for discourse_results in os.listdir(discourse_output_dir):
        if os.path.isdir(os.path.join(discourse_output_dir,
                                      discourse_results)):
            continue
        if 'strip_output' in discourse_results and 'FAILED_PARSE' not in discourse_results and 'FAILED_SEG' not in discourse_results:
            results_dict[discourse_results[:-17]] = list()
            text_tuples = list()
            classify_discourse_tree(discourse_results,
                                    results_dict[discourse_results[:-17]],
                                    discourse_output_dir,
                                    topic_number,
                                    text_tuples,
                                    mode=mode)
    if mode == 'LDA':
        write_stats_to_file(results_dict, topic_number + 1)
    elif mode == 'HLDA':
        write_stats_to_file_hlda(results_dict)
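
# A hedged usage sketch: the topic index 9 is purely illustrative and assumes
# the discourse output directory configured under [LDA_process] has already
# been populated by the discourse parser. write_stats_to_file receives
# topic_number + 1, so the argument is treated here as a zero-based index.
loop_discourse_results(9, mode='LDA')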
Code Example #4
def run_dataSet():
    DATA_PATH = parser.get('summarizerWS', 'DATA_PATH')
    queue = [file[:-4] for file in os.listdir(DATA_PATH)]

    for file in queue:
        print(f'{bcolors.OKGREEN}Processing: {file}{bcolors.ENDC}')
        create_summary_NSxTI(fileId=int(file), modelNumber=10)
Code Example #5
def create_summary(file_id, des_dir=parser.get('summarizerWS', 'des_dir')):
    file_data, topic_score = tr.score_fileID(file_id)
    count_words(file_data)
    new_summary = build_summary(file_data, topic_score)
    with open(os.path.join(des_dir,
                           str(file_id) + '.txt'), 'w') as output_file:
        output_file.writelines(new_summary)
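
# A short usage sketch; the file ID 1002 is hypothetical and assumes
# tr.score_fileID can already locate topic-score data for that file.
# The summary is written to <des_dir>/1002.txt using the configured destination.
create_summary(1002)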
Code Example #6
def createXmlDocument_v2(text,
                         fileName,
                         path=parser.get('xmlTree',
                                         'createXmlDocument_v2_path')):
    sections_type = init_section_structure()
    if not os.path.isdir('output'):
        os.mkdir('output')
    if not os.path.isdir(path):  # Checks if directory exists
        os.mkdir(path)
    section = None
    data = ET.Element('data')
    for sent in text:
        if sent.startswith('<H-'):
            section_exists = False
            sent_strip, section_id = strip_tags(sent, 'H')
            for elem in data:
                if len(elem.attrib) != 0 and elem.attrib[
                        'name'] == sections_type[section_id]:
                    section_exists = True
                    section = elem  # point section at the existing element so new sentences are appended to it
                    break
            if section_exists:
                continue
            section = ET.SubElement(data, 'section')

            section.set('name', sections_type[section_id])
            section.text = sent_strip
        if sent.startswith('<S>'):
            if section is None:
                sent = strip_tags(sent, 'S')
                sentence = ET.SubElement(data, 'S')
                sentence.text = sent
            else:
                sent = strip_tags(sent, 'S')
                sentence = ET.SubElement(section, 'S')
                sentence.text = sent
        if sent.startswith('<OneItem>'):
            if section is None:
                sent = strip_tags(sent, 'OneItem')
                sentence = ET.SubElement(data, 'OneItem')
                sentence.text = sent
            else:
                sent = strip_tags(sent, 'OneItem')
                sentence = ET.SubElement(section, 'OneItem')
                sentence.text = sent
    with open(path + fileName + ".xml", "wb") as xmlWriter:
        newData = ET.tostring(data)
        newData = re.sub(
            b'[\x00-\x08|\x0B|\x0C|\x0E-\x1F|\x7F-\x84|\x86-\x9F]', b'',
            newData)
        xmlWriter.write(newData)
    return path + fileName + ".xml"
Code Example #7
def reset_project():
    with open('reset_project_paths.txt', 'r') as rst_path:
        paths = rst_path.readlines()
        paths = ['rm ' + line[:-1] for line in paths]
        choice1 = input(
            '[WARNING!] This will delete all the data for the project!\nAre you sure? [to continue type yes]\n'
        )
        if choice1.lower() == 'yes':
            choice2 = input('Verify again! type yes!\n')
            if choice2.lower() == 'yes!':
                for path in paths:
                    print(f'Deleting data: {path}')
                    subprocess.run(path, shell=True)
                discourse_output = parser.get(
                    'LDA_process',
                    'loop_discourse_results_one_file_discourse_output_dir'
                )  # from project_paths.ini
                discourse_output1 = discourse_output + '/*'
                discourse_output2 = discourse_output + '/Nucleus/*'
                print('Deleting data: ' + discourse_output1)
                subprocess.run('rm ' + discourse_output1, shell=True)
                print('Deleting data: ' + discourse_output2)
                subprocess.run('rm ' + discourse_output2, shell=True)
                discourse_input = parser.get('main_pipeline', 'discourseInput')
                discourse_input1 = discourse_input + '*'
                discourse_input2 = ' '.join([
                    'rm', '-r',
                    discourse_input1.replace('xml/', 'xmlParse/')
                ])
                print('Deleting data: ' + discourse_input1)
                subprocess.run('rm ' + discourse_input1, shell=True)
                print('Deleting data: ' + discourse_input2)
                subprocess.run(discourse_input2, shell=True)
            else:
                print('Aborted')
        else:
            print('Aborted')
Code Example #8
def read_xml_file(filename_path,
                  filename,
                  target_dir=parser.get('xmlTree',
                                        'read_xml_file_target_dir')):
    stats_counter = {
        'name': filename,
        'words': 0,
        'sentences': 0,
        'sections': 0,
        'wordsPerSection': 0
    }

    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)
    tree = ET.parse(filename_path)
    root = tree.getroot()

    with open(os.path.join(target_dir, filename), mode='w') as txtXml:
        for elem in root:
            if elem.tag == 'S':
                txtXml.write(elem.text + '\n')
                print(elem.text)
                # Stat
                stats_counter['sentences'] += 1
                stats_counter['words'] += len(elem.text.strip().split())
            elif elem.tag == 'section':
                txtXml.write(elem.attrib['name'] + '\n')
                print(elem.attrib['name'])
                # Stat
                stats_counter['sections'] += 1
                stats_counter['words'] += len(
                    elem.attrib['name'].strip().split())
            elif elem.tag == 'OneItem':
                txtXml.write(elem.text + '\n')
                print(elem.text)
                stats_counter['words'] += len(elem.text.strip().split())
            for subelem in elem:
                txtXml.write(subelem.text + '\n')
                print(subelem.text)
                stats_counter['words'] += len(subelem.text.strip().split())
                stats_counter['wordsPerSection'] += len(
                    subelem.text.strip().split())

    # Write to stat file
    with open('data_set_statistics.csv', mode='a') as statFile:
        fieldNames = list(stats_counter.keys())
        writer = csv.DictWriter(statFile, fieldnames=fieldNames)
        writer.writerow(stats_counter)
Code Example #9
def check_files(proc_list,
                discourse_output_path=parser.get('cluster_eval',
                                                 'discourse_output_path')):
    """
    Makes sure there are files in the output dir that correspond to proc_list.
    If files are missing, removes their names from proc_list.
    """
    # Make list of all files in the output dir
    output_list = list()
    for file in os.listdir(discourse_output_path):
        if file.split('_')[0] not in output_list:
            output_list.append(file.split('_')[0] + '.txt')
    # Check difference
    diff = list(set(proc_list) - set(output_list))
    # Remove from proc_list the file names that have not been processed
    for filename in diff:
        proc_list.pop(proc_list.index(filename))
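
# A small worked example of the filtering above, using hypothetical names.
# proc_list is modified in place, so nothing is returned.
proc = ['1.txt', '2.txt', '3.txt']
# Suppose the configured output dir only contains files starting with '1_' and '2_';
# then '3.txt' is the set difference and gets popped from the list.
check_files(proc)
print(proc)  # -> ['1.txt', '2.txt']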
Code Example #10
def import_regex(path=parser.get('util_functions', 'import_regex_path')):
    """
    Imports a list of regex expressions from a file
    :param path: path to the regex file
    :return: list of regex expression strings read from the file
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(
            'File does not exist; supply the full path with extension')
    else:
        regex_arguments = list()
        with open(path, mode='r') as regex_file:
            csv_reader = csv.reader(regex_file, delimiter=',')
            for row in csv_reader:
                regex_arguments.append(row[2])
            regex_arguments.pop(0)
        return regex_arguments
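
# A usage sketch, assuming the CSV configured under [util_functions] keeps its
# pattern in the third column as import_regex expects; compiling the returned
# strings is an extra illustrative step, not part of the original pipeline.
import re

patterns = import_regex()
compiled = [re.compile(p, flags=re.IGNORECASE) for p in patterns]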
Code Example #11
def show_case(file_id, model_number, topic_words=None):
    from show_case.show_case_functions import run_show_case
    show_case_url = run_show_case(
        file_id,
        model_number,
        original_text_dir=parser.get('main_pipeline', 'original_text_dir'),
        xml_processed_dir=parser.get('main_pipeline', 'xml_processed_dir'),
        xml_parse_dir=parser.get('main_pipeline', 'xml_parse_dir'),
        topic_class_dir=parser.get('main_pipeline', 'topic_class_dir'),
        trees_dir=parser.get('main_pipeline', 'trees_dir'),
        final_stage_dir=parser.get('main_pipeline', 'final_stage_dir'),
        topic_data=convert_topics_for_show_case(model_number))
    return show_case_url
Code Example #12
def third_stage(file_id, models=None):
    from LDA_process import loop_models_one_file
    if isinstance(models, list):  # It means CLI MODE
        models[1]["file_id"] = str(file_id)
        loop_models_one_file(models[0],
                             **models[1])  # 0 - Models, 1 - Topic Paths
    else:
        loop_models_one_file(
            models,
            file_id=str(file_id),
            topic_4_model=parser.get('main_pipeline', 'topic_4_model'),
            topic_4_data_dir=parser.get('main_pipeline', 'topic_4_data_dir'),
            topic_6_model=parser.get('main_pipeline', 'topic_6_model'),
            topic_6_data_dir=parser.get('main_pipeline', 'topic_6_data_dir'),
            topic_10_model=parser.get('main_pipeline', 'topic_10_model'),
            topic_10_data_dir=parser.get('main_pipeline', 'topic_10_data_dir'),
            hdp_model=parser.get('main_pipeline', 'hdp_model'))
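
# A hedged sketch of the two calling conventions, based on how cluster_eval
# (Code Example #15) invokes third_stage; the file ID and the abbreviated paths
# below are placeholders, not values from the real configuration.
# Default mode: 'models' is forwarded as-is and the topic paths come from [main_pipeline].
third_stage(1002, models={'4': True, '6': False, '10': False, 'hdp': False})

# CLI mode: a two-element list - [0] model selection, [1] keyword arguments for
# loop_models_one_file; 'file_id' is filled in by third_stage itself.
cli_models = [
    {'4': True, '6': False, '10': False, 'hdp': False},
    {'topic_4_model': '...', 'topic_4_data_dir': '...'},  # abbreviated path dict
]
third_stage(1002, models=cli_models)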
Code Example #13
def classify_discourse_tree(filename,
                            results_dict_list,
                            dis_dir,
                            topicNumber,
                            text_tuples,
                            mode='LDA',
                            script_path=parser.get(
                                'LDA_process',
                                'classify_discourse_tree_script_path')):
    dir_path = os.path.join('output/topic_class', str(topicNumber))
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    print(bcolors.OKGREEN + 'Classifying topics for file: ' + filename +
          bcolors.ENDC)
    file_name = filename[:-4]
    file_path = os.path.join(dis_dir, filename)
    parsing_output = subprocess.check_output(
        ['python2', script_path, file_path])
    parsed_json = json.loads(parsing_output)
    sent_counter = 0
    sent_counter = recursive_read_text(parsed_json,
                                       results_dict_list,
                                       sent_counter,
                                       text_tuples,
                                       mode=mode)
    # with open(os.path.join('output/topic_class', file_name + '_topic.txt'), mode='w') as output:
    with open(os.path.join(dir_path, file_name + '_topic.txt'),
              mode='w') as output:
        json.dump(parsed_json, output)
    # write file data to pickle item
    pickle_path = os.path.join(dir_path, 'textData')
    if not os.path.exists(pickle_path):
        os.mkdir(pickle_path)
    with open(os.path.join(pickle_path, filename + '.pickle'),
              mode='wb') as pickle_file:
        pickle.dump(text_tuples, pickle_file)
Code Example #14
def write_text_to_file(fileId, newSummary):
    OUTPUT_SUMMARY_PATH = Path(
        parser.get('summarizerWS', 'OUTPUT_SUMMARY_PATH'))
    with open(OUTPUT_SUMMARY_PATH / (str(fileId) + '_NuVec_NI_TI.txt'),
              mode='w') as outFile:
        outFile.writelines(newSummary)
Code Example #15
def cluster_eval(topic_number, log_path=None):
    """# Get list of all processed files
    if log_path is None:
        print(f'{bcolors.FAIL} Please provide log path {bcolors.ENDC}')
        exit(1)
    # proc_files = processed_list('logs/D29_04_20M17_37.txt')
    proc_files = processed_list(log_path)
    print(proc_files)
    check_files(proc_files)"""
    # Process all the files that finished discourse parsing
    # third stage & four stage
    # Can be uncommented if running files separately - Stages 1+2 and then 3+4
    """
    for file_id in proc_files:
        f_id = int(file_id[:-4])  # remove .txt and convert to int value
        if config_file is None:
            # run stages
            mp.third_stage(f_id, {'4': True, '6': False, '10': False, 'hdp': False})
            mp.fourth_stage(f_id)
            mp.show_case(f_id, 4)
        else:
            argument_list = [
                config_file['third_stage']['models'],
                config_file['third_stage']['models_path']
            ]
            mp.third_stage(f_id, argument_list)  # Working on all of the models
            # Fourth stage
            # mp.fourth_stage(f_id)
            for key in config_file['third_stage']['models'].keys():
                if config_file['third_stage']['models'][key] is True:
                    mp.fourth_stage(f_id, key)
                    mp.show_case(f_id, int(key))
    
    input('Finished processing - press space+enter')
    """

    nucleus_path = os.path.join(parser.get('cluster_eval', 'evaluation_list'),
                                os.path.join(str(topic_number), 'nucleus'))
    proc_files = evaluation_list(nucleus_path)

    OTHER_SYSTEM = '/home/tzvi/PycharmProjects/HSdataprocessLinux/summarizerWS/summaries'

    create_clustering_corpus(parser.get('cluster_eval', 'original_data'),
                             parser.get('cluster_eval', 'cluster_folder'),
                             file_name='corpus.csv')

    cluster_folder_path = parser.get('cluster_eval', 'cluster_folder')
    print('Finished processing the data\nStarting to cluster....')
    start_time = time.time()
    # Calculate clustering for the current <!-original-!> files
    print(f'{bcolors.WARNING}Clustering original files{bcolors.ENDC}')
    truth, original_M = run_kmeans(
        os.path.join(cluster_folder_path, 'corpus.csv'),
        os.path.join(cluster_folder_path, 'output_plots'), proc_files,
        'clustering_report.txt')
    run_time = time.time() - start_time
    print(f'{bcolors.OKBLUE} --- {run_time} seconds --- {bcolors.ENDC}')

    print(f'{bcolors.WARNING}Creating new corpus{bcolors.ENDC}')
    # Create new clustering corpus from the processed files
    create_clustering_corpus(OTHER_SYSTEM,
                             cluster_folder_path,
                             file_name='after_corpus.csv',
                             customSystemTag='')

    start_time = time.time()
    print(f'{bcolors.WARNING}Clustering for processed files{bcolors.ENDC}')
    # Calculate clustering for the current <!-processed-!> files
    predict, predict_M = run_kmeans(
        os.path.join(cluster_folder_path, 'after_corpus.csv'),
        os.path.join(cluster_folder_path, 'after_output_plots'), proc_files,
        'after_clustering_report.txt')
    run_time = time.time() - start_time
    print(f'{bcolors.OKBLUE} --- {run_time} seconds --- {bcolors.ENDC}')

    report = create_confusion_matrix(truth, predict)

    with open('after_clustering_report.txt', 'a') as report_file:
        for key in report.keys():
            report_file.write('\n' + key + '\n' + str(report[key]))

    report_df = pd.DataFrame([original_M, predict_M, report],
                             index=['Texts', 'Summaries', 'report'])
    report_df.to_excel("eval_report.xlsx")
Code Example #16
        sg.Text(
            '- Identify sections in the text\n- Create xml file\n- Create new text file based on xml file created',
            font=('Helvetica', 11))
    ],
    [
        sg.Text('Input file path', size=(15, 1), justification='center'),
        sg.Input(disabled=True, size=(40, 1), key='filePathBrowse'),
        sg.FileBrowse()
    ],
    [
        sg.Text('Discourse input\nfolder path',
                size=(15, 2),
                justification='center'),
        sg.Input(disabled=True,
                 size=(40, 1),
                 default_text=parser.get('main_pipeline', 'discourseInput'),
                 key='discoursePathBrowse'),
        sg.FileBrowse()
    ],
    [
        sg.Button('Process', key='process_button'),
        sg.Button('Next Stage', key='nextStage_button', visible=False)
    ]
]
# endregion

# region ----- Second Stage layout ------------------
second_stage_layout = [
    [
        sg.Text('Second Stage\n--- Discourse Parsing ---',
                size=(30, 2),
Code Example #17
    def decide_topic_2(self,
                       tree_height,
                       mode=parser.get('nucleus_weight_approach', 'mode')):
        """
        Decides the unit overall topic number
        Mode:
            NS - Nucleus Importance maximum approach
            V - vector approach mode
        """
        def mul_vector(val, vector):
            return [(item[0], val * item[1]) for item in vector]

        unit_topics = dict()
        # Weights
        W_HEIGHT = 0.5
        W_POSITION = 0.3
        W_NS = 0.2
        if len(self.leaf_nodes):
            for leaf_node in self.leaf_nodes.values():
                # F1 - height
                F1 = (tree_height + 1 - leaf_node.tree_depth) / (tree_height +
                                                                 1)
                # F2 - position
                F2 = 1 if leaf_node.position.right else 0
                # F3 - nucleus / satellite
                F3 = 1 if leaf_node.node_class == 'N' else 0
                # NS
                NS = W_HEIGHT * F1 + W_POSITION * F2 + W_NS * F3
                leaf_node.ni_score = NS
                if mode == 'V':
                    self.unit_topic_vector = mul_vector(
                        NS,
                        self.unit_topic_vector if len(self.unit_topic_vector)
                        else leaf_node.topic_vector)
                    unit_vector_topic_class = max(self.unit_topic_vector,
                                                  key=lambda t: t[1])
                if leaf_node.topic_class not in unit_topics:
                    unit_topics[leaf_node.topic_class] = NS
                else:
                    unit_topics[leaf_node.topic_class] += NS

                # region Write data to pickle file
                if not os.path.exists(file_name):
                    with open(file_name, mode='wb') as dataFile:
                        cur_tup = [
                            (leaf_node.node_number_L, leaf_node.node_number_R,
                             leaf_node.text, leaf_node.topic_class,
                             leaf_node.ni_score, leaf_node.node_class,
                             leaf_node.topic_vector)
                        ]
                        pickle.dump(cur_tup, dataFile)
                else:
                    # Read the pickle file, append data and write again
                    with open(file_name, mode='rb') as dataFile:
                        cur_tup = pickle.load(dataFile)
                    with open(file_name, mode='wb') as dataFile:
                        new_tup = (leaf_node.node_number_L,
                                   leaf_node.node_number_R, leaf_node.text,
                                   leaf_node.topic_class, leaf_node.ni_score,
                                   leaf_node.node_class,
                                   leaf_node.topic_vector)
                        if new_tup not in cur_tup:
                            cur_tup.append(
                                (leaf_node.node_number_L,
                                 leaf_node.node_number_R, leaf_node.text,
                                 leaf_node.topic_class, leaf_node.ni_score,
                                 leaf_node.node_class, leaf_node.topic_vector))
                        pickle.dump(cur_tup, dataFile)
                # endregion

            N_max = max(unit_topics.values())
            topic_max = find_max(unit_topics, N_max)
            if mode == 'V':
                # Mode V
                self.unit_topic = unit_vector_topic_class[0]
                self.unit_topic_score = unit_vector_topic_class[1]
            else:
                # Mode NS
                self.unit_topic = topic_max
                self.unit_topic_score = N_max
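
# A quick worked example of the NS score defined above, with hypothetical
# values: a leaf at depth 2 in a tree of height 4 that is a right-positioned nucleus.
tree_height = 4
F1 = (tree_height + 1 - 2) / (tree_height + 1)  # 3 / 5 = 0.6
F2 = 1                                          # position.right is True
F3 = 1                                          # node_class == 'N'
NS = 0.5 * F1 + 0.3 * F2 + 0.2 * F3             # 0.3 + 0.3 + 0.2 = 0.8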
Code Example #18
def first_stage(inputFile_path,
                discourseInput=parser.get('main_pipeline', 'discourseInput')):
    import preprocess

    preprocess.pre_process_single_file(inputFile_path, discourseInput)
Code Example #19
def second_stage(xml_result_path,
                 discourse_script_path=parser.get('main_pipeline',
                                                  'discourse_script_path')):
    import os
    os.system(' '.join(['python2', discourse_script_path, xml_result_path]))
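
# A hedged sketch of chaining the first two stages; both paths are hypothetical,
# since first_stage does not return the location of the file it writes and the
# actual XML path depends on the preprocessing step.
raw_text_path = 'data/1002.txt'
first_stage(raw_text_path)            # preprocess into the configured discourseInput dir
xml_result = 'output/xml/1002.xml'    # assumed location of the preprocessing result
second_stage(xml_result)              # run the python2 discourse parser on that file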
Code Example #20
def create_model_dict(model):
    model_dict = dict()
    model_data = model.print_topics()
    print(model_data)
    for t in model_data:
        model_dict[t[0]] = ' | '.join([
            item.strip()[6:] for item in t[1].replace('"', '').split('+')[0:4]
        ])
    return model_dict
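
# A worked example of the string handling in create_model_dict, run on a
# fabricated print_topics()-style tuple (the words and weights are made up;
# gensim formats each term as weight*"word", e.g. 0.042*"heart").
fake_topics = [(0, '0.042*"heart" + 0.030*"failure" + 0.021*"patient" + '
                   '0.015*"risk" + 0.011*"study"')]
labels = {
    t[0]: ' | '.join(item.strip()[6:]
                     for item in t[1].replace('"', '').split('+')[0:4])
    for t in fake_topics
}
print(labels)  # {0: 'heart | failure | patient | risk'}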


# Load models
topic_labels = dict()
topic4 = gensim.models.ldamodel.LdaModel.load(
    parser.get('final_stage', 'load_4topic'))
topic4 = create_model_dict(topic4)
topic_labels['4'] = topic4
topic6 = gensim.models.ldamodel.LdaModel.load(
    parser.get('final_stage', 'load_6topic'))
topic6 = create_model_dict(topic6)
topic_labels['6'] = topic6
topic10 = gensim.models.ldamodel.LdaModel.load(
    parser.get('final_stage', 'load_10topic'))
topic10 = create_model_dict(topic10)
topic_labels['10'] = topic10

pp = pprint.PrettyPrinter()


# region <------------------------------ Base tree struct functions --------------------------------------------->
Code Example #21
import LDA_classifier as classifier
from print_colors import bcolors
from project_config import parser


# Change working directory
def change_working_dir():
    work_str = Path(os.getcwd())
    os.chdir(work_str.parent)


# change_working_dir()

# Directory that contains all the CSV files for each file with topic/text data
# OUTPUT_FINAL_STAGE_PATH = 'output/final_stage'
OUTPUT_FINAL_STAGE_PATH = parser.get('summarizerWS', 'OUTPUT_FINAL_STAGE_PATH')

# region Model loading
MODEL_PATH = parser.get('summarizerWS', 'MODEL_PATH')
DATA_DIR = parser.get('summarizerWS', 'DATA_DIR')

# MODEL_PATH = '/home/tzvi/PycharmProjects/HSdataprocessLinux/gensim_models/10topics/lda_model_trained_10topics.model'
# DATA_DIR = '/home/tzvi/PycharmProjects/HSdataprocessLinux/gensim_files/10Topic'


# Working with 10 topic model
def load_LDA_model(topic_model, topic_data_dir):
    classifier.load_Model_local(topic_model)
    classifier.load_data_local(topic_data_dir)
    classifier.print_model_topics()
    print(f'{bcolors.HEADER} \t [!!]\tLDA model loaded!\t[!!] {bcolors.ENDC}')
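
# Usage sketch: load the model configured above under [summarizerWS]; this
# assumes the gensim model file at MODEL_PATH and the data files in DATA_DIR
# actually exist on disk.
load_LDA_model(MODEL_PATH, DATA_DIR)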