Beispiel #1
0
def extract_document_name(file_name):
    """
    Return the document name recorded in an id file.

    The file ``data/458/ids/<file_name>`` is read through
    ``reader.read_file_line``; its second line is passed through
    ``converter.convert_to_latin`` and everything before the first ``':'``
    of the converted line is returned.

    :param file_name: name of the id file inside ``data/458/ids/``
    :return: the part of the converted second line that precedes the first
        colon (the whole line when it contains no colon)
    """
    id_path = 'data/458/ids/' + file_name
    # The name lives on the second line of the id file (index 1).
    second_line = reader.read_file_line(id_path)[1]
    latin_line = converter.convert_to_latin(second_line)
    return latin_line.split(':', 1)[0]
Beispiel #2
0
def extract_link_names(file_path):
    """
    Read a file and return its lines without their CRLF terminators.

    :param file_path: path handed to ``reader.read_file_line``
    :return: list with one entry per line read; a trailing ``'\\r\\n'`` is
        removed where present, any other line is returned unchanged
    """
    return [
        raw[:-2] if raw.endswith('\r\n') else raw
        for raw in reader.read_file_line(file_path)
    ]
Beispiel #3
0
def load_doc_num(file_path):
    """
    Load the two-way mapping between document numbers and document texts.

    Every non-empty line is parsed as ``<number>,<text>``: the part before
    the first comma is converted with ``int`` and the remainder of the line
    (which may itself contain commas) is the text. A trailing ``'\\r\\n'`` is
    removed from each line before parsing; lines that are empty afterwards
    are skipped. An empty file yields two empty dictionaries.

    :param file_path: path handed to ``reader.read_file_line``
    :return: tuple ``(doc_num_mapper, num_doc_mapper)`` where the first maps
        text -> number and the second maps number -> text
    :raises ValueError: if the part before the first comma is not an integer
    """
    doc_num_mapper = {}
    num_doc_mapper = {}
    for raw_line in reader.read_file_line(file_path):
        # Treat every line the same way, terminated or not, so the final
        # line is neither processed twice nor stored with its CRLF attached.
        line = raw_line[:-2] if raw_line.endswith('\r\n') else raw_line
        if not line:
            continue
        # Split on the first comma instead of slicing by len(str(number)):
        # the slice is wrong when the prefix has leading zeros, whitespace
        # or a sign (e.g. "007,foo").
        number_part, _, text = line.partition(',')
        number = int(number_part)
        doc_num_mapper[text] = number
        num_doc_mapper[number] = text
    return doc_num_mapper, num_doc_mapper
Beispiel #4
0
def extract_file_references(file_path):
    """
    Read reference lines and split each into its source and destination.

    Only lines ending in ``'\\r\\n'`` are used (the terminator is removed);
    each kept line is split on three consecutive tab characters, the first
    piece being the source and the second the destination.

    :param file_path: path handed to ``reader.read_file_line``
    :return: tuple ``(clean_lines, srcs, dists)`` of three parallel lists:
        the kept lines, their first pieces and their second pieces
    :raises IndexError: if a kept line does not contain the separator
    """
    # NOTE: lines lacking a CRLF terminator (such as an unterminated final
    # line) are skipped on purpose, matching the existing behaviour.
    clean_lines = []
    for raw in reader.read_file_line(file_path):
        if raw.endswith('\r\n'):
            clean_lines.append(raw[:-2])

    pairs = [entry.split('\t\t\t') for entry in clean_lines]
    srcs = [pair[0] for pair in pairs]
    dists = [pair[1] for pair in pairs]

    return clean_lines, srcs, dists
Beispiel #5
0
def extract_document_links(file_name):
    """
    Convert the link file of a document and write the resulting link names.

    When ``is_file_in_directory`` reports that ``file_name`` exists in
    ``data/458/links``, every line of that file is passed through
    ``converter.convert_to_latin``, cut at the first ``':'`` and the
    resulting list is written to ``data/latin/links/<file_name>``.
    Nothing is written otherwise.

    :param file_name: name of the link file (same name in both directories)
    :return: None
    """
    destination_file = 'data/latin/links/' + file_name
    if is_file_in_directory(file_name, 'data/458/links'):
        source_file = 'data/458/links/' + file_name
        # Keep only the part of each converted line before the first colon.
        link_names = [
            converter.convert_to_latin(raw).split(':')[0]
            for raw in reader.read_file_line(source_file)
        ]
        writer.write_list_of_lines(destination_file, link_names)