def _process_sentence_tt(self, sentence, counter):
        """Tag one sentence with TreeTagger and store the tokens that are kept.

        Every line returned by ``self.treetagger.TagText`` is split on tabs
        and must yield exactly three fields; other lines are reported on
        stderr and skipped.  The second field is mapped through ``getTag``
        and the third field is lower-cased.  A token is kept only if it is
        not one of TreeTagger's placeholder strings, its tag is not
        ``NO_POS_SYM`` and it is at most ``max_word_len`` characters long.
        Kept tokens are stored as ``<token>_<tag>_<self.lang>``.

        sentence -- text handed to ``self.treetagger.TagText``
        counter  -- key under which the resulting list is stored in
                    ``self.sentences``
        """
        tokens_pos_tagged = []

        for tagged_line in self.treetagger.TagText(sentence):
            fields = tagged_line.split("\t")
            if len(fields) != 3:
                # Python 2 print-to-stream statement, as used elsewhere in
                # this file.
                print >> stderr, "Caution -- broken TreeTagger case: ", fields, "(list)"
                continue  # Skip it
            # NOTE(review): lang_1 is not defined in this block while the
            # suffix below uses self.lang -- presumably a module-level name;
            # confirm that it is the intended language here.
            pos_tag = getTag(fields[1], lang_1)
            token = fields[2].lower()
            # Those cases we don't want.
            if (
                token not in ("<unknown>", "@ord@", "@card@")
                and pos_tag != NO_POS_SYM
                and len(token) <= max_word_len
            ):
                # Collect the formatted token; without this append the list
                # stored below would always be empty.
                tokens_pos_tagged.append(token + "_" + pos_tag + "_" + self.lang)

        self.sentences[counter] = tokens_pos_tagged
def main():
    # Command-line entry point: parse the options, work out the column file
    # and the space files, load them, then read the input sentence by
    # sentence and write the formatted best translations to the output file.
    #
    # NOTE(review): several statements in this function look incomplete in
    # this view (dangling `if`/`for` headers, a truncated assignment, pairs
    # of assignments that overwrite each other).  Each spot is marked below;
    # confirm against the original file before relying on this code.
    #
    # NOTE(review): the `global` statement below ends in a line continuation
    # that runs straight into the `parser = ...` assignment, so at least one
    # global name appears to be missing.
    global input_is_tokenized, use_lemmatization, space_cols_file, \
           loaded_space_file_s, loaded_space_file_t, source_lang, \
           target_lang, input_file, output_file, tag_cutoff, \
           no_stopword_print, number_of_translations, \
           number_of_neighbours, different_pos_punishment, \
    parser = argparse.ArgumentParser(description="Word translations" + \
                                     " that fit best to the sentence")
    # Boolean switches.
    parser.add_argument("-k", "--tokenized", 
           help="use pretokenized input", action="store_true")
    parser.add_argument("-l", "--lemmatized", 
           help="use lemmatization", action="store_true")
    parser.add_argument("-p", "--returntag", 
           help="return language tag", action="store_true")
    # File locations and languages.
    parser.add_argument("-d", "--dimensions", type=str,
           help="column file for the input matrix")
    parser.add_argument("-m", "--sourcematrix", type=str,
           help="pickled input matrix for source language")
    parser.add_argument("-y", "--targetmatrix", type=str,
           help="pickled input matrix for target language")
    parser.add_argument("-s", "--sourcelang", type=str, 
           help="input language")
    parser.add_argument("-t", "--targetlang", type=str,
           help="output language")
    parser.add_argument("-i", "--infile", type=str, 
           help="input file")
    parser.add_argument("-o", "--outfile", type=str, 
           help="output file")
    # NOTE(review): unlike the switches above this option has no `action`,
    # so argparse expects a value after it -- confirm that this is intended.
    parser.add_argument("-nsp", "--no-stopword-print", 
           help="Omit to print words without candidates -- usually " + \
                 "stop words.")
    # NOTE(review): the values of -nt, -nn and -dpp are not read anywhere in
    # the visible part of this function although the matching globals are
    # declared above.
    parser.add_argument("-nt", "--number-of-translations", type=float,
           help="The number of candidates to show for each input word.")
    parser.add_argument("-nn", "--number-of-neighbours", type=int,
           help="The number of neighbours for each input word to " + \
                "consider in the similarity space constructed.")
    parser.add_argument("-dpp", "--different-pos-punishment", 
           type=float, help="The score's fraction to punish a " + \
                             "candidate word which is there, but " + \
                             "has not the same POS as its input peer.")
    args = parser.parse_args()
    # Options given on the command line override the module-level settings.
    if args.sourcelang:
        source_lang = args.sourcelang
    if args.targetlang:
        target_lang = args.targetlang
    if args.tokenized:
        input_is_tokenized = True
    if args.lemmatized:
        use_lemmatization = True
    # Column file: explicit option first, otherwise a name built from
    # DATA_DIR_OUT and the language(s).
    if args.dimensions:
        space_cols_file = args.dimensions
    elif source_lang == target_lang:
        # NOTE(review): the second assignment overwrites the first; an
        # `else:` between them appears to be missing.
        space_cols_file = DATA_DIR_OUT + source_lang + '-words.col'
        space_cols_file = DATA_DIR_OUT \
                        + '_'.join(sorted([source_lang,target_lang])) \
                        + '-words.col'
    # Source-language space file.
    if args.sourcematrix:
        loaded_space_file_s = args.sourcematrix
    elif source_lang == target_lang:
        # NOTE(review): same pattern -- an `else:` appears to be missing.
        loaded_space_file_s = DATA_DIR_OUT + source_lang + '.pkl'
        loaded_space_file_s = DATA_DIR_OUT + source_lang \
                            + '_' + source_lang + '-' + target_lang \
                            + '.pkl'
    # Target-language space file; the default is only derived while the
    # current setting is still the empty string.
    if args.targetmatrix:
        loaded_space_file_t = args.targetmatrix
    elif source_lang == target_lang and loaded_space_file_t == "":
        # NOTE(review): same pattern -- an `else:` appears to be missing.
        loaded_space_file_t = DATA_DIR_OUT + target_lang + '.pkl'
        loaded_space_file_t = DATA_DIR_OUT + target_lang \
                            + '_' + target_lang + '-' + source_lang \
                            + '.pkl'
    # Replace the global input/output file objects when paths are given.
    if args.infile:
        input_file = open(args.infile, "r")
    if args.outfile:
        output_file = open(args.outfile, "w")
    if args.returntag:
        tag_cutoff = 0
        if args.lemmatized:
            # NOTE(review): 5 is immediately overwritten by 3; an `else:`
            # appears to be missing between these two lines.
            tag_cutoff = 5
            tag_cutoff = 3
    if args.no_stopword_print:
        no_stopword_print = args.no_stopword_print

    # vector dimension/columns for input matrix and matrix per sentence
    space_cols_fileobject = open(space_cols_file, "r")
    # space_cols = space_cols_fileobject.readlines()
    # NOTE(review): the next line is truncated (the expression before
    # `"\n")` is missing) -- presumably the column file is read and split on
    # newlines with the trailing empty entry dropped; confirm.
    space_cols ="\n")[:-1] 

    # load the space
    loaded_space = {}
    loaded_space[source_lang] = io_utils.load(loaded_space_file_s)
    # only load it once for similarity queries in the same language
    # (when both languages are equal the key is already present here)
    if not loaded_space.get(target_lang):
        loaded_space[target_lang] = io_utils.load(loaded_space_file_t)

    # Initialize TreeTagger only once (for later use)
    treetagger = TreeTagger(TAGLANG=source_lang, TAGDIR=treetagger_path,
                            TAGINENC=ENC, TAGOUTENC=ENC)
    # work on input file
    while True:
        line = input_file.readline()
        words = [] # words in sentence
        lemmas = [] # lemmas in sentence
        pos = [] # part-of-speech tags per word in sentence
        formatted = []
        # matrix for sentence
        freq = defaultdict(lambda: defaultdict(int))

        # Stop when file is entirely read
        if not line:
            # NOTE(review): no body is visible for this `if` -- presumably a
            # `break` out of the `while True` loop; confirm.

        # For pre-treetagged text
        if input_is_tokenized:
            # Consume tab-separated token lines until a line that starts
            # with one of the characters . : ? !
            while not re.match(r'[.:?!]', line):
                t = line.rstrip()
                w = t.split("\t")[0]
                p = helpers.getTag(t.split("\t")[1], source_lang)
                l = t.split("\t")[2]
                formatted.append(helpers.dimensionformat(w, p, l, 
                                 source_lang, use_lemmatization))
                line = input_file.readline()
                if not line:
                    # NOTE(review): no body is visible for this `if` --
                    # presumably it leaves the loop at end of file; confirm.

        # Use tree-tagger as lemmatizer and/or tokenizer
        # NOTE(review): an `else:` before the next line and a `try:` /
        # `except` pair around the field extraction appear to be missing;
        # the indentation below is inconsistent as shown.
            treetagger_sentence = treetagger.TagText(line)
            for t in treetagger_sentence:
                    w = t.split("\t")[0]
                    p = helpers.getTag(t.split("\t")[1], source_lang)
                    l = t.split("\t")[2]
                    print >> sys.stderr, \
                             "Caution: TreeTagger token cannot " + \
                             "be processed:", t
                    continue # Skip it
                formatted.append(helpers.dimensionformat(w, p, l, 
                                 source_lang, use_lemmatization))

        # fill matrix for sentence
        # (every ordered pair of entries in `formatted`, including an entry
        # with itself, adds one to its cell)
        for i in formatted:
            for j in formatted:
                freq[i][j] += 1

        # build unique list of the words in this sentence for the rows
        uniqwords = set()
        for l in formatted:
            # NOTE(review): no loop body is visible -- presumably each entry
            # is added to `uniqwords`; confirm.
        query_rows = list(uniqwords) # rows for sentence matrix

        # dissect compatible matrix
        # (zero matrix with one row per query row and one column per entry
        # of space_cols)
        m = np.mat(np.zeros(shape=(len(query_rows), len(space_cols))))

        # convert sentence matrix to compatible matrix
        for i in range(len(query_rows)):
            for j in range(len(space_cols)):
                m[i, j] = freq[query_rows[i]][space_cols[j]]

        # build dissect matrix
        query_space = Space(DenseMatrix(m), query_rows, space_cols)

        # for every word print neighbours with similarity
        # NOTE(review): nothing in the visible code appends to `words`,
        # `pos` or `lemmas`, so this loop would not run as shown; the
        # appends appear to be missing from the branches above.
        for i in range(len(words)):
            best_translations = get_best_translations(words[i], pos[i], 
                                lemmas[i], query_space, loaded_space)
            output_file.write(format_best_translations(words[i], pos[i], 
                              lemmas[i], best_translations))

        # In tokenized mode also write the first tab-separated field of the
        # current line (the line on which the inner `while` loop stopped).
        if input_is_tokenized:
            output_file.write(line.split("\t")[0] + "\n")

    # NOTE(review): neither `if` below has a visible body -- presumably the
    # files opened from the command-line paths are closed here; confirm.
    if args.infile:
    if args.outfile: