Code example #1
    def __load(self, ifile, encd):
        '''Load regular expressions from ifile. '''
        re_list = [RegExpStruct()]
        match = None
        cnt = 0
        finput = AltFileInput(ifile, encoding = encd)

        for line in finput:
            match = RE_OPTIONS.match(line)
            # different regexp options will separate different
            # chunks of regular expressions
            if match:
                # increment counter only if we have already seen any
                # regexps before
                if cnt != 0 or re_list[0][0]:
                    re_list.append(RegExpStruct())
                    cnt += 1
                # securely interpret options passed as strings as valid
                # Python code
                re_list[cnt][1] = self.__parse_options(match)
            else:
                # strip off comments
                line = skip_comments(line)
                # and remember the line if it is not empty
                if line:
                    re_list[cnt][0].extend(self.istring_hook(line))
        return self.compile(re_list)
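
The chunking pattern above (an option line opens a new group of expressions) can be reproduced with plain standard-library code. Below is a minimal, self-contained sketch under a simplified format in which option lines start with `OPTIONS:`; the `OPT_RE` pattern and `load_chunks` helper are illustrative assumptions, not part of the project (`RE_OPTIONS`, `RegExpStruct`, and `AltFileInput` are project-specific and not used here):

import re

# hypothetical option-line format; the original uses the project-specific
# RE_OPTIONS pattern instead
OPT_RE = re.compile(r"OPTIONS:\s*(.*)")

def load_chunks(lines):
    """Group patterns into [options, patterns] chunks, as __load does."""
    chunks = [["", []]]
    for line in lines:
        m = OPT_RE.match(line)
        if m:
            # an option line opens a new chunk, unless nothing has been
            # collected yet (mirrors the `cnt != 0 or re_list[0][0]` test)
            if chunks[-1][1] or chunks[-1][0]:
                chunks.append([m.group(1), []])
            else:
                chunks[-1][0] = m.group(1)
        elif line.strip():
            chunks[-1][1].append(line.strip())
    return chunks

print(load_chunks(["a.*b", "OPTIONS: re.I", "c+d"]))
# [['', ['a.*b']], ['re.I', ['c+d']]]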
Code example #2
    def __init__(self, fname, encoding = 'utf-8'):
        """Read P2P rules from file and populate instance."""
        self.rules = []
        self.flags = ''
        # honor the caller-supplied encoding
        ifile = AltFileInput(fname, encoding = encoding)
        for line in ifile:
            self.__parse(line)
Code example #3
def load_polar_dicts(dfnames):
    """Load polar words into polarity dictionary."""
    global polar_dict
    # read all named polarity dictionaries as one input stream
    finput = AltFileInput(*dfnames)
    word = tag = ""
    score = 0
    for iline in finput:
        if not COMMENT_RE.match(iline):
            word, tag, score = iline.split('\t')
            if tag == ANY_TAG:
                polar_dict[word.lower()] = abs(float(score))
            else:
                polar_dict[(word.lower(), tag)] = abs(float(score))
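
The resulting dictionary keys tag-specific entries by `(word, tag)` tuples and tag-independent ones by the bare word. A hypothetical lookup helper for the consumer side; the function name and the fallback order are assumptions, not part of the original code:

def polarity(word, tag, polar_dict):
    """Return the polarity score for word/tag, preferring tagged entries."""
    word = word.lower()
    # a tag-specific (word, tag) entry takes precedence over a
    # tag-independent entry stored under the bare word
    if (word, tag) in polar_dict:
        return polar_dict[(word, tag)]
    return polar_dict.get(word, 0.0)

polar_dict = {("good", "ADJ"): 0.9, "nice": 0.7}
print(polarity("Good", "ADJ", polar_dict))  # 0.9
print(polarity("NICE", "NN", polar_dict))   # 0.7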
Code example #4
    def __init__(self, ifile=DEFAULT_NR_FILE):
        """Create an instance of NoiseRestorer.

        @param ifile - name of file containing list of elements which should be
                       restored

        """
        # set of words which are replacements that should be restored
        self.rwords = set()
        # list of regexps; replacements matching any of these regexps
        # should be restored to their original form
        self.rre = []
        # container for storing replacement information
        self.restoreList = []
        self.tokenOffsets = Offsets()
        finput = AltFileInput(ifile)
        mobj = None
        for line in finput:
            line = skip_comments(line)
            if not line:
                continue
            mobj = __RWORD_RE__.match(line)
            if mobj:
                self.rwords.add(mobj.group(1))
                continue
            mobj = __RREX_RE__.match(line)
            if mobj:
                self.rre.append("(?:" + mobj.group(1) + ")")
                continue
            raise RuleFormatError(
                "Unrecognized line format for NoiseRestorer.")
        if self.rre:
            self.rre = re.compile("(?:" + '|'.join(self.rre) + ")")
        else:
            # fall back to a regexp that never matches anything
            self.rre = re.compile("(?!)")
        self.t_offset = -1
        self.r_offset = -1
        self.t_length = -1
        self.r_length = -1
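
The `re.compile("(?!)")` fallback is a common trick: an empty negative lookahead fails at every position, so the compiled object behaves like a regexp that matches nothing. A small demonstration using only the standard `re` module:

import re

never = re.compile("(?!)")
print(never.match("anything"))   # None
print(never.search(""))          # None

# joining alternatives the same way NoiseRestorer does
parts = ["(?:foo)", "(?:ba[rz])"]
joined = re.compile("(?:" + "|".join(parts) + ")")
print(bool(joined.match("baz")))  # True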
Code example #5
    def __load(self, ifile):
        """Load map entries from file ifile."""
        output = {}
        optmatch = None
        finput = AltFileInput(ifile, encoding=self.encd)
        for line in finput:
            if line:
                optmatch = RE_OPTIONS.match(line)
                if optmatch:
                    if self.flags:
                        raise RuleFormatError(
                            msg="Multiple flag lines are not supported",
                            efile=finput)
                    else:
                        self.flags = optmatch.group(1)
                        self.ignorecase = RegExp(self.flags,
                                                 "").re.flags & re.IGNORECASE
                        continue
                # find map entries
                line = skip_comments(line)
                m = MAP_DELIMITER.search(line)
                if m:
                    src, trg = self.__normalize_quotes(line[0:m.start()],
                                                       line[m.end():])
                    if not (src and trg):
                        print src.encode('utf-8')
                        print trg.encode('utf-8')
                        raise RuleFormatError(efile=finput)
                    src = re.escape(src)
                    if self.ignorecase:
                        output[src.lower()] = trg
                    else:
                        output[src] = trg
                elif line:
                    raise RuleFormatError(efile=finput)
        return output
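
The source keys are run through `re.escape` before being stored, which suggests they are later joined into a single alternation regexp for substitution. A sketch of that step under that assumption (`compile_map` is a hypothetical helper; the excerpt does not show the project's actual compilation code):

import re

def compile_map(output, ignorecase):
    """Compile the dict returned by __load into one substitution function."""
    flags = re.IGNORECASE if ignorecase else 0
    # the keys are already re.escape()d, so they can be joined verbatim;
    # longest-first ordering makes the alternation prefer longer matches
    rx = re.compile("|".join(sorted(output, key=len, reverse=True)), flags)

    def lookup(m):
        key = re.escape(m.group(0))
        # mirror __load: keys were lowercased after escaping when the
        # ignore-case flag was set
        return output[key.lower() if ignorecase else key]

    return lambda text: rx.sub(lookup, text)

sub = compile_map({re.escape("a.b"): "X"}, False)
print(sub("say a.b now"))  # say X now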
Code example #6
                       help="switch verbose statistics mode on",
                       action="store_true")
args = argparser.parse_args()

##################################################################
# Main
unigram_prob = pickle.load(args.unigram_prob_file)
args.unigram_prob_file.close()
bigram_prob = pickle.load(args.bigram_prob_file)
args.bigram_prob_file.close()

esc_char = args.esc_char
skip_line = args.skip_line

foutput = AltFileOutput(encoding=args.encoding, flush=args.flush)
finput = AltFileInput(*args.files, print_func=foutput.fprint, errors='replace')
memory = Memory()

# unfortunately, rules for restoration of misspellings are currently hard-coded
# in the `misspellings.py` file
misspelling_restorer = MisspellingRestorer(unigram_prob, bigram_prob)

# iterate over the input lines, echoing empty and skip lines unchanged, and
# pre-cache information about replacements
for line in finput:
    # print empty and skip lines unchanged
    if line == skip_line or not line:
        # flush the memory first, unless it is empty
        print_mem()
        foutput.fprint(line)
    # check if current line contains meta information
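
The two `pickle.load` calls expect files written with matching `pickle.dump` calls. A minimal sketch of how such probability files might be produced (the file names and dictionary contents are made up for illustration):

import pickle

unigram_prob = {"haus": 0.01, "maus": 0.002}
bigram_prob = {("die", "maus"): 0.0004}

with open("unigram.pckl", "wb") as ofile:
    pickle.dump(unigram_prob, ofile)
with open("bigram.pckl", "wb") as ofile:
    pickle.dump(bigram_prob, ofile)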
Code example #7
                               action = 'store_true', default=True)
    argparser.add_argument("files", help = "input files in which equal and odd strings should be aligned", \
                               nargs = '*', type = argparse.FileType('r'), \
                               default = [sys.stdin])
    args = argparser.parse_args()
    # input/output encoding
    enc = args.encoding
    # determine which type of alignment is requested
    if args.needleman_wunsch:
        alignfunc = nw_align
    else:
        alignfunc = hb_align
    # establish Input/Output
    foutput = AltFileOutput(encoding=args.encoding)
    finput = AltFileInput(*args.files,
                          print_func = foutput.fprint,
                          errors = "replace")

    # auxiliary variables
    line1 = line2 = ''
    oline1 = []
    oline2 = []
    alignment = []
    c_list = []
    c_i = c_len = 0
    fnr = 0

    # iterate over input lines
    for line in finput:
        if finput.fnr % 2 == 0:
            line2 = line
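
The `fnr % 2` test pairs consecutive lines: odd-numbered lines are buffered and even-numbered ones complete a pair for alignment. A self-contained sketch of that pairing logic, independent of AltFileInput (`pair_lines` is a hypothetical helper; `enumerate` starts at 1 to mimic a 1-based line counter):

def pair_lines(lines):
    """Yield (odd_line, even_line) pairs, mimicking the fnr % 2 test."""
    line1 = None
    for fnr, line in enumerate(lines, start=1):
        if fnr % 2 == 0:
            yield line1, line
        else:
            line1 = line
    # an unpaired trailing line would need separate handling

for a, b in pair_lines(["src 1", "tgt 1", "src 2", "tgt 2"]):
    print((a, b))
# ('src 1', 'tgt 1')
# ('src 2', 'tgt 2')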
Code example #8
precede lines with meta-information""", nargs = 1, type = str,
                           default = os.environ.get("SOCMEDIA_ESC_CHAR", ""))
    argparser.add_argument("-e", "--encoding", help="input/output encoding",
                           default = DEFAULT_LANG)
    argparser.add_argument("conll_file",
                           help="file with DG trees in CONLL format")
    argparser.add_argument("token_file",
                           help="file with original tokenization")
    argparser.add_argument("word_file", help="file with MMAX words")
    argparser.add_argument("annotation_files",
                           help="files with MMAX markables",
                           nargs='*')
    args = argparser.parse_args()

    # variables
    esc_char = args.esc_char
    foutput = AltFileOutput(encoding=args.encoding)
    finput = AltFileInput(args.conll_file, print_func=foutput.fprint)

    # skip files with no annotation
    if not args.annotation_files:
        sys.exit(0)
    # read and parse CONLL file
    conlldic = read_conll(finput)
    # read and parse tokenization file
    tkndoc = ET.parse(args.token_file)
    # read and parse MMAX word file
    wrddoc = ET.parse(args.word_file)
    # merge annotation with CONLL data
    merge_conll_mmax_doc(conlldic, tkndoc, wrddoc, args.annotation_files)