Example #1
    def charge(self, input_voltage=0.0):
        if self.check_voltage(input_voltage):
            for percentage in range(1, 11):
                self.print_charge_percentage(percentage)

            # when charge is complete
            clean_line()
            print('Charge is complete')
Example #2
    def print_charge_percentage(self, percentage):
        _pretty_battery = list('[         ]')
        for i in range(1, percentage):
            _pretty_battery[i] = '='
        sleep(0.2)

        if percentage > 1:
            clean_line()

        print('Charging : ' + "".join(_pretty_battery) + '>')
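
Examples #1 and #2 call a clean_line() helper that is not shown. A minimal sketch, assuming an ANSI-capable terminal (an illustration, not the example's actual helper): it moves the cursor up one line and erases it, so the next print() overwrites the previous progress line.

import sys

def clean_line():
    # Assumed helper: move the cursor up one line and erase it using ANSI
    # escape codes, so the next print() replaces the old progress line.
    sys.stdout.write('\x1b[1A\x1b[2K\r')
    sys.stdout.flush()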
Example #3
def find_steam_info(game_dir):
    """Determine the steam ID and game name of this folder, if it has one.

    This only works on Source games!
    """
    game_id = -1
    name = "ERR"
    found_name = False
    found_id = False
    for folder in os.listdir(game_dir):
        info_path = os.path.join(game_dir, folder, 'gameinfo.txt')
        if os.path.isfile(info_path):
            with open(info_path) as file:
                for line in file:
                    clean_line = utils.clean_line(line).replace('\t', ' ')
                    if not found_id and 'steamappid' in clean_line.casefold():
                        raw_id = clean_line.casefold().replace(
                            'steamappid', '').strip()
                        try:
                            game_id = int(raw_id)
                            found_id = True
                        except ValueError:
                            pass
                    elif not found_name and 'game ' in clean_line.casefold():
                        found_name = True
                        ind = clean_line.casefold().rfind('game') + 4
                        name = clean_line[ind:].strip().strip('"')
                    if found_name and found_id:
                        break
        if found_name and found_id:
            break
    return game_id, name
Example #4
def parse(posfile, propfile, path):
    "Parse through the given palette file to get all data."
    props = Property.parse(propfile, path + ':properties.txt')
    name = "Unnamed"
    opts = {}
    for option in props:
        if option.name == "name":
            name = option.value
        else:
            opts[option.name.casefold()] = option.value
    pos = []
    for dirty_line in posfile:
        line = utils.clean_line(dirty_line)
        if line:
            # Lines follow the form
            # "ITEM_BUTTON_FLOOR", 2
            # for subtype 3 of the button (the index is zero-based)
            if line.startswith('"'):
                val = line.split('",')
                if len(val) == 2:
                    pos.append((
                        val[0][1:], # Item ID
                        int(val[1].strip()), # Item subtype
                        ))
                else:
                    print("Malformed row '"+line+"'!")
                    return None
    return Palette(name, pos, opts, filename=path)
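
Most of the examples in this listing rely on a project-specific utils.clean_line helper whose source is not included. For the config-file parsers (Examples #3, #4, #6 and #8), the inline comments ("Skip blank lines, strip whitespace, and allow // comments") suggest behaviour roughly like the sketch below; this is an assumption, not the real implementation. Note that the RDF examples (#14 and #21) call a differently shaped clean_line(line, ignore_literals=..., obtain_resource_manually=...) that returns a triple, so the two helpers merely share a name.

def clean_line(line):
    # Assumed behaviour: drop any trailing // comment, then strip the
    # surrounding whitespace. Blank and comment-only lines become ''
    # (falsy), which is why callers guard with `if line:`.
    comment_start = line.find('//')
    if comment_start != -1:
        line = line[:comment_start]
    return line.strip()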
Example #5
def segment_target_translation(matrix, target):
    if not target:
        matrix = [list(i) for i in zip(*matrix)]
    finalString = ""
    translation = []
    lastCol = -1
    last_alignment = ""
    for i in range(1, len(matrix)):  # for each element
        col = get_max_prob_col(i, matrix)
        aligned_word = matrix[0][col]
        if lastCol == -1:  # first character
            finalString += matrix[i][0]  # put the character at the beginning
            last_alignment = aligned_word
        elif lastCol == col:  # if the current character and the last one are not separated
            finalString += matrix[i][0]
        else:
            finalString += " " + matrix[i][0]
            translation.append(last_alignment)
            last_alignment = aligned_word
        lastCol = col
    translation.append(last_alignment)
    finalString = finalString.replace("  ", " ")
    if finalString[-1] == " ":
        finalString = finalString[:-1]
    if finalString[0] == " ":
        finalString = finalString[1:]
    discovered_words = finalString.split(" ")
    assert len(discovered_words) == len(translation)
    # If the EOS symbol was segmented, remove it and its aligned translation.
    if discovered_words[-1] == utils.EOS_symbol:
        discovered_words = discovered_words[:-1]
        translation = translation[:-1]
    finalString = utils.clean_line(" ".join(discovered_words))
    return finalString, " ".join(translation)
Example #6
def parse(posfile, propfile, path):
    "Parse through the given palette file to get all data."
    props = Property.parse(propfile, path + ':properties.txt')
    name = "Unnamed"
    opts = {}
    for option in props:
        if option.name == "name":
            name = option.value
        else:
            opts[option.name.casefold()] = option.value
    pos = []
    for dirty_line in posfile:
        line = utils.clean_line(dirty_line)
        if line:
            # Lines follow the form
            # "ITEM_BUTTON_FLOOR", 2
            # for subtype 3 of the button (the index is zero-based)
            if line.startswith('"'):
                val = line.split('",')
                if len(val) == 2:
                    pos.append((
                        val[0][1:], # Item ID
                        int(val[1].strip()), # Item subtype
                        ))
                else:
                    LOGGER.warning('Malformed row "{}"!', line)
                    return None
    return Palette(name, pos, opts, filename=path)
Example #8
    def parse(cls, data):
        conf = data.info.find_key('Config', '')
        mats = [prop.value for prop in data.info.find_all('AddIfMat')]
        if conf.has_children():
            # Allow having a child block to define packlists inline
            files = [prop.value for prop in conf]
        else:
            path = 'pack/' + conf.value + '.cfg'
            try:
                with data.zip_file.open(path) as f:
                    # Each line is a file to pack.
                    # Skip blank lines, strip whitespace, and
                    # allow // comments.
                    files = []
                    for line in f:
                        line = utils.clean_line(line)
                        if line:
                            files.append(line)
            except KeyError as ex:
                raise FileNotFoundError('"{}:{}" not in zip!'.format(
                    data.id,
                    path,
                )) from ex

        return cls(
            data.id,
            files,
            mats,
        )
Example #9
def scrape_dissertation_page(page, alumni_list, faculty_list):
    if page['text'] == 'ERROR':
        return {
            'alumni_match': None,
            'alumni_compared_line': None,
            'faculty_matches': None
        }

    page = [
        line for line in re.split(r'; |, |\*|\n', page['text'])
        if line not in ['', None]
    ]

    cleaned_lines = []
    for line in page:
        line = strip_page_stop_words(line)
        if len(line) > 0:
            line = reduce(lambda a, b: a + ' ' + b, line)
            cleaned_lines.append(line)

    page = cleaned_lines
    committee_index = next(
        (i for i, line in enumerate(page) if 'committee' in clean_line(line)),
        0)

    alumni_matches = get_top_matches(page,
                                     alumni_list,
                                     normalize_func=lambda i: i)
    faculty_matches = get_top_matches(page[committee_index + 1:],
                                      faculty_list,
                                      normalize_func=lambda i: i)

    alumni_matches = [tup for tup in alumni_matches if tup[2] >= 0.90]
    faculty_matches = [tup for tup in faculty_matches if tup[2] >= 0.90]

    if len(alumni_matches):
        compared_line = reduce(
            lambda a, b: str(a) + ' ' + str(b),
            [tup[0] for tup in alumni_matches if tup[2] >= 0.90])

        alumni_matches = [tup[1] for tup in alumni_matches]
    else:
        compared_line = None
        alumni_matches = None

    if len(faculty_matches):
        faculty_matches = [
            [tup[1], 'Chair'] if
            (i == 0) or ('chair' in tup[0].lower()) else [tup[1], 'Non-chair']
            for i, tup in enumerate(faculty_matches)
        ]
    else:
        faculty_matches = None

    return {
        'alumni_match': alumni_matches,
        'alumni_compared_line': compared_line,
        'faculty_matches': faculty_matches
    }
Example #10
    def edit_gameinfo(self, add_line=False):
        """Modify all gameinfo.txt files to add or remove our line.

        Add_line determines if we are adding or removing it.
        """

        for folder in self.dlc_priority():
            info_path = os.path.join(self.root, folder, 'gameinfo.txt')
            if os.path.isfile(info_path):
                with open(info_path) as file:
                    data = list(file)

                for line_num, line in reversed(list(enumerate(data))):
                    clean_line = utils.clean_line(line)
                    if add_line:
                        if clean_line == GAMEINFO_LINE:
                            break  # Already added!
                        elif '|gameinfo_path|' in clean_line:
                            LOGGER.debug(
                                "Adding gameinfo hook to {}",
                                info_path,
                            )
                            # Match the line's indentation
                            data.insert(
                                line_num+1,
                                utils.get_indent(line) + GAMEINFO_LINE + '\n',
                                )
                            break
                    else:
                        if clean_line == GAMEINFO_LINE:
                            LOGGER.debug(
                                "Removing gameinfo hook from {}", info_path
                            )
                            data.pop(line_num)
                            break
                else:
                    if add_line:
                        LOGGER.warning(
                            'Failed editing "{}" to add our special folder!',
                            info_path,
                        )
                    continue

                with open(info_path, 'w') as file:
                    for line in data:
                        file.write(line)
        if not add_line:
            # Restore the original files!
            for name, file, ext in FILES_TO_BACKUP:
                item_path = self.abs_path(file + ext)
                backup_path = self.abs_path(file + '_original' + ext)
                old_version = self.abs_path(file + '_styles' + ext)
                if os.path.isfile(old_version):
                    LOGGER.info('Restoring Stylechanger version of "{}"!', name)
                    shutil.copy(old_version, item_path)
                elif os.path.isfile(backup_path):
                    LOGGER.info('Restoring original "{}"!', name)
                    shutil.move(backup_path, item_path)
            self.clear_cache()
Example #11
def main():
    global NUM_TOP_TOPICS

    check_start('./test_doc_guess_algorithm.py <folder> ' +
                '<output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]
    num_topics = int(sys.argv[3])

    topic_order_count = 0
    topic_feeling_count = 0
    topic_offset_count = 0

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    info_file = folder + '/out/final.other'

    with open(format_file) as data_file:
        format_lines = data_file.readlines()

    with open(info_file) as data_file:
        info_lines = data_file.readlines()

    NUM_TOP_TOPICS = int(info_lines[0].split(' ')[1])

    known_topics = get_known_topics(folder)
    print(str(known_topics[0]))
    word_dict = get_word_dictionary(folder, vocab_file)
    hierarchy_struct = get_hierarchy_struct(folder)

    # Cycle through top-level documents
    count = 0
    for index in range(len(format_lines)):
        line = format_lines[index]
        line = line.strip()

        # known_topics = get_known_topics()

        weights = get_document_values(index, hierarchy_struct)
        contents = line.split('|~|')[3]
        contents = clean_line(contents)
        classed_doc = classify(contents, weights, word_dict)
        classed_array = get_classed_array(classed_doc)

        if index % 1000 == 0 and index != 0:
            print('\tTopics correct position ' +
                  '%d: %s' % (index, str(topic_order_count / index)))

        if compare_topics_order(classed_array, known_topics[index], 1):
            topic_order_count += 1

        count += 1

    print('\tTopics correct position %d: %s' % (count, str(topic_order_count / count)))
Example #12
    def parse(cls, data):
        conf = data.info.find_key('Config', '')
        mats = [
            prop.value
            for prop in
            data.info.find_all('AddIfMat')
        ]
        if conf.has_children():
            # Allow having a child block to define packlists inline
            files = [
                prop.value
                for prop in conf
            ]
        else:
            path = 'pack/' + conf.value + '.cfg'
            try:
                with data.zip_file.open(path) as f:
                    # Each line is a file to pack.
                    # Skip blank lines, strip whitespace, and
                    # allow // comments.
                    files = []
                    for line in f:
                        line = utils.clean_line(line)
                        if line:
                            files.append(line)
            except KeyError as ex:
                raise FileNotFoundError(
                    '"{}:{}" not in zip!'.format(
                        data.id,
                        path,
                    )
                ) from ex
        if CHECK_PACKFILE_CORRECTNESS:
            # Use normpath so sep differences are ignored, plus case.
            zip_files = {
                os.path.normpath(file).casefold()
                for file in
                zip_names(data.zip_file)
                if file.startswith('resources')
            }
            for file in files:
                #  Check to make sure the files exist...
                file = os.path.join('resources', os.path.normpath(file)).casefold()
                if file not in zip_files:
                    LOGGER.warning('Warning: "{file}" not in zip! ({pak_id})',
                        file=file,
                        pak_id=data.pak_id,
                    )

        return cls(
            data.id,
            files,
            mats,
        )
Example #13
    def edit_gameinfo(self, add_line=False):
        """Modify all gameinfo.txt files to add or remove our line.

        Add_line determines if we are adding or removing it.
        """

        for folder in self.dlc_priority():
            info_path = os.path.join(self.root, folder, 'gameinfo.txt')
            if os.path.isfile(info_path):
                with open(info_path) as file:
                    data = list(file)

                for line_num, line in reversed(list(enumerate(data))):
                    clean_line = utils.clean_line(line)
                    if add_line:
                        if clean_line == GAMEINFO_LINE:
                            break  # Already added!
                        elif '|gameinfo_path|' in clean_line:
                            print("Adding gameinfo hook to " + info_path)
                            # Match the line's indentation
                            data.insert(
                                line_num + 1,
                                utils.get_indent(line) + GAMEINFO_LINE + '\n',
                            )
                            break
                    else:
                        if clean_line == GAMEINFO_LINE:
                            print("Removing gameinfo hook from " + info_path)
                            data.pop(line_num)
                            break
                else:
                    if add_line:
                        print('Failed editing "' + info_path +
                              '" to add our special folder!')
                    continue

                with open(info_path, 'w') as file:
                    for line in data:
                        file.write(line)
        if not add_line:
            # Restore the original files!
            for name, file, ext in FILES_TO_BACKUP:
                item_path = self.abs_path(file + ext)
                backup_path = self.abs_path(file + '_original' + ext)
                old_version = self.abs_path(file + '_styles' + ext)
                if os.path.isfile(old_version):
                    print("Restoring Stylechanger version of " + name + "!")
                    shutil.copy(old_version, item_path)
                elif os.path.isfile(backup_path):
                    print("Restoring original " + name + "!")
                    shutil.move(backup_path, item_path)
            self.clear_cache()
Example #14
def generate_inst_dict(instance_file_path, lines_to_read, vertex_set=None, print_details=False):
    """
    :param instance_file_path: path to the instance file used to generate the dictionary;
    :param lines_to_read: number of lines to read in the file; if <= 0, read the entire file;
    :param print_details: boolean, if True print details about the state of the processing;
    :param vertex_set: if present, use a set of vertices as starting point, 
        with all the vertices being "owl#Thing";
    :return: dictionary that contains for each entity its instance type;
    """
    
    # Dictionary where the entity types are stored;
    instance_dict = {}
    
    current_line = 0
    start_time = time.time()
    with utils.read_compressed(instance_file_path) as infile:
        for line in infile:
            # Skip the header line, and skip commented-out lines;
            if current_line != 0 and line[0] != "#":
                # Clean the line and split it in 4; keep only the first 3;
                triple = utils.clean_line(line, ignore_literals=True, obtain_resource_manually=True)
                entity_name, _, entity_type = triple
 
                # Add to the dict;
                # Note: the data are dirty; some entities appear more than once
                # with different types.
                # As this problem doesn't occur often, the last type that appears is kept.
                # If the entity is already present, with type "owl#Thing",
                # the type is overwritten with something more specific.
                if len(entity_name) > 0 and len(entity_type) > 0:
                    if vertex_set is None or entity_name in vertex_set:
                        if entity_name not in instance_dict or\
                        (entity_name in instance_dict and instance_dict[entity_name] == "owl#Thing"):
                            instance_dict[entity_name] = entity_type
                
            current_line += 1

            if not current_line % 100000 and print_details:
                print("LINES READ: {} -- ELAPSED TIME: {:.2f} seconds"\
                      .format(current_line, time.time() - start_time))
                 
            # Stop reading if enough lines have been read;
            if lines_to_read > 0 and current_line > lines_to_read:
                break
            
    # Add all the other vertices;
    if vertex_set is not None:
        for v in vertex_set:
            if v not in instance_dict:
                instance_dict[v] = "owl#Thing"
    
    return instance_dict
Example #15
def process_docx(file_path: Path):
    """ Extracts text from .docx files
    Args:
        file_path(Path) : Path object that contains the file_path of the .docx file
    Returns:
        list : The sentences extracted from the file
    """
    doc = Document(file_path)
    sentences = []
    for para in doc.paragraphs:
        for line in para.text.split("."):
            line = clean_line(line)
            if line:
                sentences.append(line)
    return sentences
Example #16
def process_doc(file_path: Path):
    """ Extracts text from .doc files
    Args:
        file_path(Path) : Path object that contains the file_path of the .doc file
    Returns:
        list : The sentences extracted from the file
    """
    try:
        p = sp.run(["catdoc", str(file_path)], capture_output=True)
        output = p.stdout.decode()
        sentences = [clean_line(line) for line in output.split("\n\n") if line]
        return sentences
    except FileNotFoundError as e:
        print("Unable to process", file_path)
        print(e.strerror)
        return []
Example #17
def process_pdf(file_path: Path):
    """ Extracts text from .pdf files
    Args:
        file_path(Path) : Path object that contains the file_path of the .pdf file
    Returns:
        list : The sentences extracted from the file
    """
    sentences = []
    with open(file_path, "rb") as f:
        resmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(resmgr, device)
        for page in PDFPage.get_pages(f, caching=True, check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox):
                    sentences.append(clean_line(lt_obj.get_text()))
    return sentences
Example #18
def segment_target(matrix, target):
    if not target:
        matrix = [list(i) for i in zip(*matrix)]
    finalString = ""
    lastCol = -1
    for i in range(1, len(matrix)):  # for each element
        col = get_max_prob_col(i, matrix)
        if lastCol == -1:  # first character
            finalString += matrix[i][0]  # put the character at the beginning
        elif lastCol == col:  # if the current character and the last one are not separated
            finalString += matrix[i][0]
        else:
            finalString += " " + matrix[i][0]
        lastCol = col
    finalString = finalString.replace("  ", " ")
    if finalString[-1] == " ":
        finalString = finalString[:-1]
    if finalString[0] == " ":
        finalString = finalString[1:]
    return utils.clean_line(finalString)
Example #19
    def parse(cls, data):
        conf = data.info.find_key('Config', '')
        mats = [
            prop.value
            for prop in
            data.info.find_all('AddIfMat')
        ]
        if conf.has_children():
            # Allow having a child block to define packlists inline
            files = [
                prop.value
                for prop in conf
            ]
        else:
            path = 'pack/' + conf.value + '.cfg'
            try:
                with data.zip_file.open(path) as f:
                    # Each line is a file to pack.
                    # Skip blank lines, strip whitespace, and
                    # allow // comments.
                    files = []
                    for line in f:
                        line = utils.clean_line(line)
                        if line:
                            files.append(line)
            except KeyError as ex:
                raise FileNotFoundError(
                    '"{}:{}" not in zip!'.format(
                        data.id,
                        path,
                    )
                ) from ex

        return cls(
            data.id,
            files,
            mats,
        )
Example #20
def main():
    check_start('./test_doc_guess_algorithm.py <folder> <output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]
    num_topics = int(sys.argv[3])

    topic_order_count = [0] * num_topics
    topic_feeling_count = [0] * num_topics
    topic_offset_count = [0] * num_topics

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    word_assignment_file = folder + '/out/word-assignments.dat'
    gamma_file = folder + '/out/final.gamma'

    # Read in required data
    with open(vocab_file) as data_file:
        vocab_lines = data_file.readlines()

    with open(word_assignment_file) as data_file:
        word_assignment_lines = data_file.readlines()

    with open(format_file) as data_file:
        format_lines = data_file.readlines()

    with open(gamma_file) as data_file:
        gamma_lines = data_file.readlines()

    vocab_details = get_vocab_details(vocab_lines, word_assignment_lines)
    topic_lines = get_document_topics(gamma_lines)
    vocab_index = get_vocab_index(vocab_lines)

    num_docs = len(format_lines)

    for index in range(len(format_lines)):
        line = format_lines[index]
        line = line.strip()

        known_topics = topic_lines[index].strip().split(' ')
        document_contents = line.split('|~|')[3]
        document_contents = clean_line(document_contents)
        classed_doc = classify(document_contents, vocab_details, vocab_index)

        if len(classed_doc) == 0:
            print(document_contents)
            continue

        if index % 1000 == 0 and index != 0:
            print('Snapshot: ' + str(index))
            for i in range(1, num_topics):
                print('\tTopics correct position %d: %s' % (i, str(topic_order_count[i] / index)))
                print('\tTopics within range %d: %s' % (i, str(topic_feeling_count[i] / index)))
                print('\tTopics within offset %d: %s' % (i, str(topic_offset_count[i] / index)))

        for i in range(1, len(classed_doc)):
            if compare_topics_order(classed_doc, known_topics, i):
                topic_order_count[i] += 1

            if compare_topic_slice(classed_doc, known_topics, i):
                topic_feeling_count[i] += 1

            if compare_position_offset(classed_doc, known_topics, i):
                topic_offset_count[i] += 1

    with open(output, 'w') as out_ptr:
        out_ptr.write('Number of documents: ' + str(num_docs) + '\n')
        for i in range(num_topics):
            out_ptr.write('Topics correct position %d: %s\n' % (i, str(topic_order_count[i] / num_docs)))
            out_ptr.write('Topics within range %d: %s\n' % (i, str(topic_feeling_count[i] / num_docs)))
            out_ptr.write('Topics within offset %d: %s\n' % (i, str(topic_offset_count[i] / num_docs)))
Example #21
def write_graph(input_path,
                output_path_v,
                output_path_e,
                instance_dict=None,
                lines_to_read=0,
                vertex_dict=None,
                print_details=True,
                instance_dict_vertices_only=True,
                ignore_literals=True,
                add_orphan_vertices=False,
                add_triples=False,
                track_vertex_pairs=False,
                track_all_edges=False,
                edge_set=None,
                skip_existing_pairs=False):
    """
    :param input_path: a text file containing a list of RDF triples;
    :param output_path_v: the file path where the vertices are written;
    :param output_path_e: the file path where the edges are written;
    :param vertex_dict: an existing vertex-id dictionary to extend; if None,
        a new one is created and the output files are opened in write mode;
    :param instance_dict: the dictionary that contains the instance type of each vertex;
    :param lines_to_read: the maximum number of triples to be read;
        if < 1 read the entire file;
    :param add_orphan_vertices: if True, add to the graph all the vertices in instance dict;
    :param print_details: boolean, if True print details about the state of the processing;
    :param add_triples: if True, return a dictionary that contains all the
        triples that have been added to the graph;
    :param track_vertex_pairs: if True, store the vertex pairs that are added as edges, 
        in a directed way.
    :param track_all_edges: if True, store all the vertex pairs,
        otherwise store only redirects/disambiguations.
    :param edge_set: if present, add vertex pairs to this set;
    :param skip_existing_pairs: if True, don't add the edges that are present in the edge_set;
    :return: tuple with the number of lines read, the triple dictionary,
        the edge set, the vertex dictionary, and the edge count;
        
    Write a list of RDF triples into a PGX-compatible graph, 
    written in EDGELIST format. 
    """

    # Read the input file line-by-line, and add the information to a dictionary
    # that represents the graph;
    # Write the graph as EDGELIST text files;

    # A dictionary in which each entity is given a unique id;
    # If specified, load an existing one;
    if vertex_dict is not None:
        write_mode = "a+"
    else:
        vertex_dict = {}
        write_mode = "w+"

    # Time the execution;
    start_time = time.time()

    triple_dict = {}
    if edge_set is None:
        edge_set = set()
    edge_filter = ["wikiPageRedirects", "wikiPageDisambiguates"]

    current_line = 0
    edge_count = 0
    skipped_self_loops = 0
    # ID of the next vertex to be added;
    # using len(graph) + 1 allows incremental additions;
    vertex_id = len(vertex_dict) + 1
    with open(output_path_v, write_mode, encoding="utf-8") as outfile_v:
        with open(output_path_e, write_mode, encoding="utf-8") as outfile_e:
            with utils.read_compressed(input_path) as infile:
                for line in infile:
                    # Skip the header line, and skip commented-out lines;
                    if current_line != 0 and line[0] != "#":
                        # Create a triple from the given line;
                        triple = utils.clean_line(
                            line,
                            ignore_literals,
                            obtain_resource_manually=True)
                        source, relation, destination = triple

                        if source == destination:
                            skipped_self_loops += 1

                        # It is possible to skip the current edge if its vertices are not in the list of vertices;
                        if not instance_dict_vertices_only or (instance_dict is None) \
                            or ((source in instance_dict) and (destination in instance_dict)):

                            # Add the triple to the graph;
                            # Also add a unique vertex_id to each vertex that is added;

                            # The third element is processed first,
                            # so that we have its unique ID if we have to add a new edge;
                            skip_dest = False
                            # Check if the current triple should be skipped;
                            if (source == "" or relation == ""):
                                skip_dest = True
                            if ignore_literals and destination == "":
                                skip_dest = True

                            # Add source and destination vertices;

                            # Add the source vertex;
                            if source not in vertex_dict:
                                # Keep track of the vertex with a unique ID
                                vertex_dict[source] = vertex_id

                                # Write the name of the entity and its type;
                                # each line has the form "entity_name, {instance_type}";
                                outfile_v.write('"{}" * "{}"\n'.format(
                                    source,
                                    get_instance_type(source, instance_dict)))
                                vertex_id += 1

                            # Add the destination vertex;
                            if (not skip_dest) and (destination
                                                    not in vertex_dict):
                                # Keep track of the vertex with a unique ID
                                vertex_dict[destination] = vertex_id

                                # Write a new vertex like before;
                                outfile_v.write('"{}" * "{}"\n'.format(
                                    destination,
                                    get_instance_type(destination,
                                                      instance_dict)))
                                vertex_id += 1

                            # Add a new edge;

                            # Skip self-loops;
                            if not skip_dest and (source != destination):
                                if add_triples:
                                    if source in triple_dict:
                                        triple_dict[source] += [triple]
                                    else:
                                        triple_dict[source] = [triple]
                                # Write a new edge;

                                if not (skip_existing_pairs and
                                        (vertex_dict[source],
                                         vertex_dict[destination])
                                        in edge_set):
                                    outfile_e.write('"{}" "{}" "{}"\n'\
                                                    .format(source, destination, relation))
                                    edge_count += 1
                                # Keep track of the pairs (source, destination), after writing the current edge
                                # (otherwise no edge is added!);
                                if track_vertex_pairs:
                                    if track_all_edges or relation in edge_filter:
                                        edge_set.add(
                                            (vertex_dict[source],
                                             vertex_dict[destination]))

                    current_line += 1

                    if not current_line % 100000 and print_details:
                        print("\tLINES READ: {} -- TIME: {:.2f} seconds"
                              " -- TOT. VERTICES: {} -- EDGES ADDED: {}".format(
                                  current_line,
                                  time.time() - start_time, vertex_id, edge_count))

                    # Stop reading if enough lines have been read;
                    if lines_to_read > 0 and current_line > lines_to_read:
                        break

        # Add all the remaining vertices;
        if add_orphan_vertices and instance_dict is not None:
            additional_vertices = 0
            for v in instance_dict:
                if v not in vertex_dict:
                    vertex_dict[v] = vertex_id
                    outfile_v.write('"{}" * "{}"\n'.format(
                        v, get_instance_type(v, instance_dict)))
                    vertex_id += 1
                    additional_vertices += 1

            if print_details:
                print("ADDITIONAL VERTICES FROM INSTANCE DICT: {}".format(
                    additional_vertices))

    print("SKIPPED SELF LOOPS: {}".format(skipped_self_loops))

    return (current_line - 1, triple_dict, edge_set, vertex_dict, edge_count)
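
The clean_line variant used here and in Example #14 evidently parses one N-Triples-style RDF line into a (source, relation, destination) triple, optionally discarding literal objects. Its real cleanup rules are not shown, so the following is only an illustrative sketch under those assumptions; the regex and the URI trimming are guesses, not the actual implementation.

import re

# Assumed line shape: <subject-uri> <predicate-uri> (<object-uri> | "literal"...) .
TRIPLE_RE = re.compile(r'<([^>]+)>\s+<([^>]+)>\s+(<[^>]+>|"[^"]*"\S*)\s*\.')

def clean_line(line, ignore_literals=True, obtain_resource_manually=True):
    # obtain_resource_manually is accepted only for signature compatibility
    # with the examples above; this sketch does not use it.
    match = TRIPLE_RE.match(line.strip())
    if not match:
        return "", "", ""
    source, relation, destination = match.groups()
    # Keep only the last path component of each resource URI.
    source = source.rsplit('/', 1)[-1]
    relation = relation.rsplit('/', 1)[-1]
    if destination.startswith('<'):
        destination = destination[1:-1].rsplit('/', 1)[-1]
    elif ignore_literals:
        destination = ""
    return source, relation, destination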
Example #22
    def edit_gameinfo(self, add_line=False):
        """Modify all gameinfo.txt files to add or remove our line.

        Add_line determines if we are adding or removing it.
        """

        if self.is_modded() == add_line:
            # It's already in the correct state!
            return

        for folder in self.dlc_priority():
            info_path = os.path.join(self.root, folder, 'gameinfo.txt')
            if os.path.isfile(info_path):
                with open(info_path) as file:
                    data = list(file)

                for line_num, line in reversed(list(enumerate(data))):
                    clean_line = utils.clean_line(line)
                    if add_line:
                        if clean_line == GAMEINFO_LINE:
                            break  # Already added!
                        elif '|gameinfo_path|' in clean_line:
                            print("Adding gameinfo hook to " + info_path)
                            # Match the line's indentation
                            data.insert(
                                line_num+1,
                                utils.get_indent(line) + GAMEINFO_LINE + '\n',
                                )
                            break
                    else:
                        if clean_line == GAMEINFO_LINE:
                            print("Removing gameinfo hook from " + info_path)
                            data.pop(line_num)
                            break
                else:
                    if add_line:
                        print(
                            'Failed editing "' +
                            info_path +
                            '" to add our special folder!'
                        )
                    continue

                with open(info_path, 'w') as file:
                    for line in data:
                        file.write(line)
        if add_line:
            with open(self.abs_path('BEE2_EDIT_FLAG'), 'w') as file:
                file.write('')
        else:
            os.remove(self.abs_path('BEE2_EDIT_FLAG'))
            # Restore the original files!
            for name, file, ext in FILES_TO_BACKUP:
                item_path = self.abs_path(file + ext)
                backup_path = self.abs_path(file + '_original' + ext)
                old_version = self.abs_path(file + '_styles' + ext)
                if os.path.isfile(old_version):
                    print("Restoring Stylechanger version of " + name + "!")
                    shutil.copy(old_version, item_path)
                elif os.path.isfile(backup_path):
                    print("Restoring original " + name + "!")
                    shutil.move(backup_path, item_path)
            self.clear_cache()
Example #23
 def gen_train_samples(self):
     self.original2corp()
     sample_set = {}
     np.random.seed(8)
     # Load the data, treating each text as a unit
     important_tokens = []
     text = open(FLAGS.corpus_file, encoding="utf8").readlines()[:10]
     print("select important tokens...")
     for e in tqdm(text, total=len(text)):
         tmp = self.tokenizer.select_important_tokens(clean_line(e.strip()))
         if len(tmp) < 10: continue
         important_tokens.append(tmp)
     # Sample positive and negative pairs: words from the same text are positives, words from different texts are negatives
     print("sample(1+k negative) train and valid set...")
     num_neg = min(len(important_tokens) - 1, MAX_NUM_NEG)
     for cur_index, cur_ele in tqdm(enumerate(important_tokens),
                                    total=len(important_tokens)):
         np.random.shuffle(cur_ele)
         cut_index = int(len(cur_ele) / 3)
         lhs, rhs = cur_ele[:cut_index], cur_ele[cut_index:]
         for word_index, word in enumerate(lhs):
             if word in sample_set: continue
             positive_entity = rhs[word_index]  # positive sample
             # negative sampling
             negative_entitys, negs = [], []
             negative_indexes = [
                 i for i in range(len(important_tokens)) if i != cur_index
             ]
             random.shuffle(negative_indexes)
             for e in negative_indexes:
                 if (len(negs) >= num_neg): break
                 if word in important_tokens[
                         e] or positive_entity in important_tokens[e]:
                     continue
                 negs.append(e)
             for neg_index in negs:
                 while True:
                     neg_tmp = random.sample(important_tokens[neg_index],
                                             1)[0]
                     if neg_tmp != word and neg_tmp not in negative_entitys:
                         break
                 negative_entitys.append(neg_tmp)
             assert len(negative_entitys) == num_neg
             # Pad when too few negatives were sampled
             #if len(negative_entitys) < num_neg:
             #    negative_entitys += ["PAD"] * (num_neg - len(negative_entitys))
             sample_set[word] = [positive_entity, negative_entitys]
     # Build the token vocabulary
     token_freq = defaultdict(int)
     token_freq["UNKNOWN"] = 1e8
     #token_freq["PAD"] = 1e8-1
     for k, (p, n) in sample_set.items():
         tmp = [k, p] + n
         for t in tmp:
             if re_en.fullmatch(t): token_freq[t] += 1
             else:
                 for e in list(t):
                     token_freq[e] += 1
     sorted_token_freq = sorted(token_freq.items(),
                                key=lambda d: d[1],
                                reverse=True)[:VOCAB_SIZE]
     word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
     if conf.over_write_vocab:
         print("generate word2id file: %s" % (conf.vocab))
         json.dump(word2id,
                   open(conf.vocab, "w", encoding="utf8"),
                   ensure_ascii=False,
                   indent=2)
     _keys_ = list(sample_set.keys())
     train_set = {
         k: sample_set[k]
         for k in _keys_[:int(len(_keys_) * conf.train_valid_ratio)]
     }
     valid_set = {
         k: sample_set[k]
         for k in _keys_[int(len(_keys_) * conf.train_valid_ratio):]
     }
     print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d" %
           (len(sample_set), len(train_set), len(valid_set)))
     print("generate train sample file :%s\tvalid sample file: %s" %
           (conf.train_samples, conf.valid_samples))
     json.dump(train_set,
               open(conf.train_samples, "w", encoding="utf8"),
               ensure_ascii=False,
               indent=2)
     json.dump(valid_set,
               open(conf.valid_samples, "w", encoding="utf8"),
               ensure_ascii=False,
               indent=2)
Example #24
    def parse(file_contents, filename='') -> "List of Property objects":
        """Returns list of Property objects parsed from given text"""
        open_properties = [Property(None, [])]

        for line_num, line in enumerate(file_contents, start=1):
            values = open_properties[-1].value
            freshline = utils.clean_line(line)
            if not freshline:
                # Skip blank lines!
                continue

            if freshline.startswith('"'):   # data string
                line_contents = freshline.split('"')
                name = line_contents[1]
                if not utils.is_identifier(name):
                    raise KeyValError(
                        'Invalid name ' + name + '!',
                        filename,
                        line_num,
                        )
                try:
                    value = line_contents[3]
                    if not freshline.endswith('"'):
                        raise KeyValError(
                            'Key has value, but incomplete quotes!',
                            filename,
                            line_num,
                            )
                    for orig, new in REPLACE_CHARS.items():
                        value = value.replace(orig, new)
                except IndexError:
                    value = None

                values.append(Property(name, value))
            # handle name bare on one line, will need a brace on
            # the next line
            elif utils.is_identifier(freshline):
                values.append(Property(freshline, []))
            elif freshline.startswith('{'):
                if values[-1].value:
                    raise KeyValError(
                        'Property cannot have sub-section if it already '
                        'has an in-line value.',
                        filename,
                        line_num,
                        )
                values[-1].value = []
                open_properties.append(values[-1])
            elif freshline.startswith('}'):
                open_properties.pop()
            else:
                raise KeyValError(
                    "Unexpected beginning character '"
                    + freshline[0]
                    + "'!",
                    filename,
                    line_num,
                    )

            if not open_properties:
                raise KeyValError(
                    'Too many closing brackets.',
                    filename,
                    line_num,
                    )
        if len(open_properties) > 1:
            raise KeyValError(
                'End of text reached with remaining open sections.',
                filename,
                line=None,
                )
        return open_properties[0]
Example #25
    def parse(file_contents, filename='') -> "Property":
        """Returns a Property tree parsed from given text.

        filename, if set should be the source of the text for debug purposes.
        file_contents should be an iterable of strings
        """
        open_properties = [Property(None, [])]
        for line_num, line in enumerate(file_contents, start=1):
            values = open_properties[-1].value
            freshline = utils.clean_line(line)
            if not freshline:
                # Skip blank lines!
                continue

            if freshline.startswith('"'):   # data string
                line_contents = freshline.split('"')
                name = line_contents[1]
                if not utils.is_identifier(name):
                    raise KeyValError(
                        'Invalid name ' + name + '!',
                        filename,
                        line_num,
                        )
                try:
                    value = line_contents[3]
                    if not freshline.endswith('"'):
                        raise KeyValError(
                            'Key has value, but incomplete quotes!',
                            filename,
                            line_num,
                            )
                    for orig, new in REPLACE_CHARS.items():
                        value = value.replace(orig, new)
                except IndexError:
                    value = None

                values.append(Property(name, value))
            elif freshline.startswith('{'):
                if values[-1].value:
                    raise KeyValError(
                        'Property cannot have sub-section if it already '
                        'has an in-line value.',
                        filename,
                        line_num,
                        )
                values[-1].value = []
                open_properties.append(values[-1])
            elif freshline.startswith('}'):
                open_properties.pop()
            # handle name bare on one line, will need a brace on
            # the next line
            elif utils.is_identifier(freshline):
                values.append(Property(freshline, []))
            else:
                raise KeyValError(
                    "Unexpected beginning character '"
                    + freshline[0]
                    + "'!",
                    filename,
                    line_num,
                    )

            if not open_properties:
                raise KeyValError(
                    'Too many closing brackets.',
                    filename,
                    line_num,
                    )
        if len(open_properties) > 1:
            raise KeyValError(
                'End of text reached with remaining open sections.',
                filename,
                line=None,
                )
        return open_properties[0]
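
The parser in Examples #24 and #25 is invoked the way Example #4 shows (Property.parse(propfile, path + ':properties.txt')). A minimal stand-alone usage sketch, with a hypothetical file name:

# Hypothetical usage: parse a keyvalues file into a Property tree.
with open('palette/properties.txt') as f:
    props = Property.parse(f, 'palette/properties.txt')
for prop in props:
    print(prop.name, prop.value)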