def _process_sentence_tt(self, sentence, counter):
    """Process sentence with TreeTagger"""
    tokens_pos_tagged = []
    treetagger_tokens = self.treetagger.TagText(sentence)
    for token in treetagger_tokens:
        # TreeTagger is expected to emit "word\tPOS\tlemma" triples
        token_pos_tagged = token.split("\t")
        if len(token_pos_tagged) != 3:
            print >> sys.stderr, "Caution -- broken TreeTagger case:", \
                token_pos_tagged, "(list)"
            continue  # Skip it
        pos_tag = getTag(token_pos_tagged[1], self.lang)
        token = token_pos_tagged[2].lower()
        # skip unknown tokens, number placeholders, tokens without a
        # usable POS tag and overlong tokens
        if (token not in ["<unknown>", "@ord@", "@card@"]
                and pos_tag != NO_POS_SYM
                and len(token) <= max_word_len):
            token += "_" + pos_tag + "_" + self.lang
            tokens_pos_tagged.append(token)
    self.sentences[counter] = tokens_pos_tagged
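
# A minimal illustration of the expected data flow (values are made up;
# the exact tags depend on the TreeTagger parameter files and on getTag()):
#
#   self.treetagger.TagText("Dogs bark")
#   => ['Dogs\tNNS\tdog', 'bark\tVVP\tbark']
#
#   after _process_sentence_tt("Dogs bark", 0) with lang "en":
#   self.sentences[0] == ['dog_NN_en', 'bark_VV_en']
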
def main():
    global input_is_tokenized, use_lemmatization, space_cols_file, \
        loaded_space_file_s, loaded_space_file_t, source_lang, \
        target_lang, input_file, output_file, tag_cutoff, \
        no_stopword_print, number_of_translations, \
        number_of_neighbours, different_pos_punishment, \
        treetagger_path

    parser = argparse.ArgumentParser(
        description="Word translations that fit best to the sentence")
    parser.add_argument("-k", "--tokenized", action="store_true",
                        help="use pretokenized input")
    parser.add_argument("-l", "--lemmatized", action="store_true",
                        help="use lemmatization")
    parser.add_argument("-p", "--returntag", action="store_true",
                        help="return language tag")
    parser.add_argument("-d", "--dimensions", type=str,
                        help="column file for the input matrix")
    parser.add_argument("-m", "--sourcematrix", type=str,
                        help="pickled input matrix for source language")
    parser.add_argument("-y", "--targetmatrix", type=str,
                        help="pickled input matrix for target language")
    parser.add_argument("-s", "--sourcelang", type=str,
                        help="input language")
    parser.add_argument("-t", "--targetlang", type=str,
                        help="output language")
    parser.add_argument("-i", "--infile", type=str, help="input file")
    parser.add_argument("-o", "--outfile", type=str, help="output file")
    parser.add_argument("-nsp", "--no-stopword-print", action="store_true",
                        help="Omit printing words without candidates -- "
                             "usually stop words.")
    parser.add_argument("-nt", "--number-of-translations", type=int,
                        help="The number of candidates to show for each "
                             "input word.")
    parser.add_argument("-nn", "--number-of-neighbours", type=int,
                        help="The number of neighbours for each input word "
                             "to consider in the similarity space "
                             "constructed.")
    parser.add_argument("-dpp", "--different-pos-punishment", type=float,
                        help="The score's fraction used to punish a "
                             "candidate word that is present but does not "
                             "share the POS of its input peer.")
    args = parser.parse_args()

    if args.sourcelang:
        source_lang = args.sourcelang
    if args.targetlang:
        target_lang = args.targetlang
    if args.tokenized:
        input_is_tokenized = True
    if args.lemmatized:
        use_lemmatization = True

    # derive default file names from the language pair if not given
    if args.dimensions:
        space_cols_file = args.dimensions
    elif source_lang == target_lang:
        space_cols_file = DATA_DIR_OUT + source_lang + '-words.col'
    else:
        space_cols_file = DATA_DIR_OUT \
            + '_'.join(sorted([source_lang, target_lang])) \
            + '-words.col'

    if args.sourcematrix:
        loaded_space_file_s = args.sourcematrix
    elif source_lang == target_lang:
        loaded_space_file_s = DATA_DIR_OUT + source_lang + '.pkl'
    else:
        loaded_space_file_s = DATA_DIR_OUT + source_lang \
            + '_' + source_lang + '-' + target_lang + '.pkl'

    if args.targetmatrix:
        loaded_space_file_t = args.targetmatrix
    elif source_lang == target_lang and loaded_space_file_t == "":
        loaded_space_file_t = DATA_DIR_OUT + target_lang + '.pkl'
    else:
        loaded_space_file_t = DATA_DIR_OUT + target_lang \
            + '_' + target_lang + '-' + source_lang + '.pkl'

    if args.infile:
        input_file = open(args.infile, "r")
    if args.outfile:
        output_file = open(args.outfile, "w")

    # number of trailing characters (the appended tag) to cut off when
    # printing; 0 keeps the tag
    if args.returntag:
        tag_cutoff = 0
    elif args.lemmatized:
        tag_cutoff = 5
    else:
        tag_cutoff = 3

    if args.no_stopword_print:
        no_stopword_print = args.no_stopword_print
    # propagate the remaining tuning options to the module-level defaults
    if args.number_of_translations:
        number_of_translations = args.number_of_translations
    if args.number_of_neighbours:
        number_of_neighbours = args.number_of_neighbours
    if args.different_pos_punishment:
        different_pos_punishment = args.different_pos_punishment
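    # Example invocation (a sketch; the file names are hypothetical and
    # the script name depends on how this module is installed):
    #
    #   python best_translations.py -s en -t de -i sentences.txt \
    #       -o candidates.txt -nt 5 -nn 20
    #
    # Without -d/-m/-y, the column and matrix files are derived from the
    # language pair as above.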
    # vector dimensions/columns for the input matrix and the per-sentence
    # matrix
    space_cols_fileobject = open(space_cols_file, "r")
    space_cols = space_cols_fileobject.read().split("\n")[:-1]
    space_cols_fileobject.close()

    # load the spaces; for similarity queries in the same language the
    # source space is loaded only once and reused
    loaded_space = {}
    loaded_space[source_lang] = io_utils.load(loaded_space_file_s)
    if not loaded_space.get(target_lang):
        loaded_space[target_lang] = io_utils.load(loaded_space_file_t)

    # Initialize TreeTagger only once (for later use)
    treetagger = TreeTagger(TAGLANG=source_lang, TAGDIR=treetagger_path,
                            TAGINENC=ENC, TAGOUTENC=ENC)

    # work on the input file
    while True:
        line = input_file.readline()
        words = []      # words in sentence
        lemmas = []     # lemmas in sentence
        pos = []        # part-of-speech tags per word in sentence
        formatted = []  # matrix rows for the sentence
        freq = defaultdict(lambda: defaultdict(int))

        # Stop when the file is entirely read
        if not line:
            break

        # For pre-treetagged text: one "word\tPOS\tlemma" line per token,
        # sentences ending in punctuation
        if input_is_tokenized:
            while not re.match(r'[.:?!]', line):
                t = line.rstrip()
                w = t.split("\t")[0]
                p = helpers.getTag(t.split("\t")[1], source_lang)
                l = t.split("\t")[2]
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(
                    w, p, l, source_lang, use_lemmatization))
                line = input_file.readline()
                if not line:
                    break
        # Use TreeTagger as lemmatizer and/or tokenizer
        else:
            treetagger_sentence = treetagger.TagText(line)
            for t in treetagger_sentence:
                try:
                    w = t.split("\t")[0]
                    p = helpers.getTag(t.split("\t")[1], source_lang)
                    l = t.split("\t")[2]
                except IndexError:
                    print >> sys.stderr, \
                        "Caution: TreeTagger token cannot " \
                        "be processed:", t
                    continue  # Skip it
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(
                    w, p, l, source_lang, use_lemmatization))

        # fill the co-occurrence matrix for the sentence
        for i in formatted:
            for j in formatted:
                freq[i][j] += 1

        # build a unique list of the words in this sentence for the rows
        query_rows = list(set(formatted))  # rows for the sentence matrix

        # DISSECT-compatible matrix
        m = np.mat(np.zeros(shape=(len(query_rows), len(space_cols))))

        # convert the sentence matrix to a compatible matrix
        for i in range(len(query_rows)):
            for j in range(len(space_cols)):
                m[i, j] = freq[query_rows[i]][space_cols[j]]

        # build the DISSECT space for the sentence
        query_space = Space(DenseMatrix(m), query_rows, space_cols)

        # for every word print its neighbours with similarity scores
        for i in range(len(words)):
            best_translations = get_best_translations(
                words[i], pos[i], lemmas[i], query_space, loaded_space)
            output_file.write(format_best_translations(
                words[i], pos[i], lemmas[i], best_translations))

        if input_is_tokenized:
            output_file.write(line.split("\t")[0] + "\n")

    if args.infile:
        input_file.close()
    if args.outfile:
        output_file.close()
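
# For orientation: get_best_translations() and format_best_translations()
# are defined elsewhere in this module. A neighbour lookup against a
# DISSECT space presumably runs along these lines (word and neighbour
# count are hypothetical; the query word's vector is taken from the
# sentence space, the neighbours from the loaded space):
#
#   from composes.similarity.cos import CosSimilarity
#   neighbours = query_space.get_neighbours(
#       "dog_NN_en", 20, CosSimilarity(),
#       space2=loaded_space[target_lang])
#   # => [('hund_NN_de', 0.83), ...]  (word, similarity) pairs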