Ejemplo n.º 1
0
def _get_form2lemma(a_fname):
    """Populate global form/lemma mappings from a tab-separated file.

    @param a_fname - name of input file

    @return void (correspondences are read into global variables)

    """
    global STOP_WORDS, FORM2LEMMA

    # fail early if the file cannot be opened for reading
    readable = os.path.isfile(a_fname) and os.access(a_fname, os.R_OK)
    if not readable:
        raise RuntimeError("Cannot read from file '{:s}'".format(a_fname))

    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for line in ifile:
            line = line.strip()
            if not line:
                continue
            form, tag, lemma = TAB_RE.split(line)
            form = normalize(form)
            # lemmas are remembered only for informative PoS tags;
            # every other form is treated as a stop word
            if len(tag) > 1 and tag[:2] in INFORMATIVE_TAGS:
                FORM2LEMMA[form] = normalize(lemma)
            else:
                STOP_WORDS.add(form)
Ejemplo n.º 2
0
def _tkm_add_corpus(ising, a_cc_file):
    """Add lexical edges from a corpus file to the Ising spin model.

    @param a_ising - instance of the Ising spin model
    @param a_cc_file - file containing conjoined word pairs extracted from
      corpus

    @return \c void

    """
    with codecs.open(a_cc_file, 'r', ENCODING) as ifile:
        for raw_line in ifile:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            fields = TAB_RE.split(raw_line)
            # only well-formed "lemma1 <TAB> lemma2 <TAB> weight" lines count
            if len(fields) != 3:
                continue
            term1, term2, weight = fields
            # map surface forms to lemmas where a mapping is known
            term1 = FORM2LEMMA.get(term1, term1)
            term2 = FORM2LEMMA.get(term2, term2)
            if check_word(term1) and check_word(term2):
                ising.add_edge(normalize(term1), normalize(term2),
                               float(weight), a_add_missing=True)
Ejemplo n.º 3
0
def _get_form2lemma(a_fname):
    """Read file containing form/lemma correspondences.

    Each input line is expected to hold three tab-separated fields:
    surface form, PoS tag, and lemma.

    @param a_fname - name of input file

    @return void (correspondences are read into global variables)

    @raise RuntimeError - if the file does not exist or is not readable

    """
    global STOP_WORDS, FORM2LEMMA

    # fail early if the file cannot be opened for reading
    if not os.path.isfile(a_fname) or not os.access(a_fname, os.R_OK):
        raise RuntimeError("Cannot read from file '{:s}'".format(a_fname))

    iform = itag = ilemma = ""
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            # NOTE(review): raises ValueError on lines without exactly
            # three tab-separated fields
            iform, itag, ilemma = TAB_RE.split(iline)
            iform = normalize(iform)
            # remember lemmas only for informative PoS tags; all other
            # forms are treated as stop words
            if len(itag) > 1 and itag[:2] in INFORMATIVE_TAGS:
                FORM2LEMMA[iform] = normalize(ilemma)
            else:
                STOP_WORDS.add(iform)
Ejemplo n.º 4
0
def _read_set(a_fname):
    """Read initial seed set of terms.

    Each non-comment line is expected to contain at least two
    tab-separated fields: the term and its polarity class; an optional
    third field may mark the term as a regular expression.

    @param a_fname - name of input file containing terms

    @return void (global seed sets and regexps are updated in place)

    @raise RuntimeError - on a malformed line or an unknown polarity class
    @raise NotImplementedError - on a regexp term with a non-polar class

    """
    global POS_SET, NEG_SET, NEUT_SET, POS_RE, NEG_RE
    pos_regs = []
    neg_regs = []
    with codecs.open(a_fname, 'r',
                     encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            elif iline.startswith(COMMENT):
                # maybe, we will later introduce some special comments
                continue
            fields = TAB_RE.split(iline)
            if len(fields) < 2:
                # previously such lines crashed with an uninformative
                # IndexError on the `fields[1]` accesses below
                raise RuntimeError(
                    "Invalid line format: {:s}".format(repr(iline)))
            if len(fields) > 2 and fields[2] == REGEXP:
                if fields[1] == POSITIVE:
                    pos_regs.append(normalize_reg(fields[0]))
                elif fields[1] == NEGATIVE:
                    neg_regs.append(normalize_reg(fields[0]))
                else:
                    raise NotImplementedError(
                        "Regular expressions are not supported"
                        " for non-polar classes.")
                continue
            if fields[1] == POSITIVE:
                POS_SET.add(normalize(fields[0]))
            elif fields[1] == NEGATIVE:
                NEG_SET.add(normalize(fields[0]))
            elif fields[1] == NEUTRAL:
                NEUT_SET.add(normalize(fields[0]))
            else:
                # report the polarity field itself, not the last field
                raise RuntimeError(
                    "Unknown field specification: {:s}".format(fields[1]))
    # compile the collected regexps into single alternation patterns
    if pos_regs:
        POS_RE = join_regs(pos_regs)
    if neg_regs:
        NEG_RE = join_regs(neg_regs)
Ejemplo n.º 5
0
def _read_set(a_fname):
    """Read initial seed set of terms.

    Each non-comment line should contain at least two tab-separated
    fields: the term and its polarity class; an optional third field may
    mark the term as a regular expression.

    @param a_fname - name of input file containing terms

    @return void (global seed sets and regexps are updated in place)

    @raise RuntimeError - on an unknown polarity class
    @raise NotImplementedError - on a regexp term with a non-polar class

    """
    global POS_SET, NEG_SET, NEUT_SET, POS_RE, NEG_RE
    fields = []
    pos_regs = []
    neg_regs = []
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            elif iline.startswith(COMMENT):
                # maybe, we will later introduce some special comments
                continue
            fields = TAB_RE.split(iline)
            # NOTE(review): a line with a single field raises IndexError on
            # the `fields[1]` accesses below instead of a descriptive error
            if len(fields) > 2 and fields[2] == REGEXP:
                if fields[1] == POSITIVE:
                    pos_regs.append(normalize_reg(fields[0]))
                elif fields[1] == NEGATIVE:
                    neg_regs.append(normalize_reg(fields[0]))
                else:
                    raise NotImplementedError(
                        "Regular expressions are not supported"
                        " for non-polar classes.")
                continue
            if fields[1] == POSITIVE:
                POS_SET.add(normalize(fields[0]))
            elif fields[1] == NEGATIVE:
                NEG_SET.add(normalize(fields[0]))
            elif fields[1] == NEUTRAL:
                NEUT_SET.add(normalize(fields[0]))
            else:
                # NOTE(review): reports the last field, which is the
                # polarity field only for two-field lines
                raise RuntimeError("Unknown field specification: {:s}".format(
                    fields[-1]))
    # compile the collected regexps into single alternation patterns
    if pos_regs:
        POS_RE = join_regs(pos_regs)
    if neg_regs:
        NEG_RE = join_regs(neg_regs)
Ejemplo n.º 6
0
def _read_files_helper(a_crp_files, a_encoding=ENCODING):
    """Read corpus files and yield (form, tag, lemma) triples.

    A (None, None, None) triple is yielded at tweet boundaries (lines
    starting with the escape character) and at the end of each file.

    @param a_crp_files - files of the original corpus
    @param a_encoding - encoding of the vector file

    @return (Iterator over file lines)

    """
    i = 0
    tokens_seen = False
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', a_encoding) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # skip blank lines and sentence-end markers
                if not iline or SENT_END_RE.match(iline):
                    continue
                elif iline[0] == ESC_CHAR:
                    # escape character marks the start of a new tweet
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            # in fast mode, read at most a prefix of the file
                            break
                    if tokens_seen:
                        tokens_seen = False
                        yield None, None, None
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                # narrowed from a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit; unpacking a wrong number of
                # fields raises ValueError
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                tokens_seen = True
                yield iform, itag, normalize(ilemma)
        yield None, None, None
Ejemplo n.º 7
0
def _read_files_helper(a_crp_files, a_encoding=ENCODING):
    """Read corpus files and yield (form, tag, lemma) triples.

    A (None, None, None) triple is yielded at tweet boundaries and at the
    end of each file.

    @param a_crp_files - files of the original corpus
    @param a_encoding - encoding of the vector file

    @return (Iterator over file lines)

    """
    i = 0
    tokens_seen = False
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', a_encoding) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # skip blank lines and sentence-end markers
                if not iline or SENT_END_RE.match(iline):
                    continue
                elif iline[0] == ESC_CHAR:
                    # escape character marks the start of a new tweet
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            # in fast mode, only a prefix of each file is read
                            break
                    if tokens_seen:
                        tokens_seen = False
                        yield None, None, None
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # `except ValueError:` would be safer here
                except:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)),
                          file=sys.stderr)
                    continue
                tokens_seen = True
                yield iform, itag, normalize(ilemma)
        yield None, None, None
Ejemplo n.º 8
0
def _read_files(a_stat, a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_stat - statistics on term occurrences
    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return 2-tuple - number of positive and number of negative tweets

    @note modifies `a_stat' in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    iform = itag = ilemma = ""
    tlemmas = set()             # lemmas collected for the current tweet
    tweet_stat = [0, 0, 0]
    # union of all seed terms for fast membership tests
    seeds = a_pos | a_neg | a_neut
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if iline and iline[0] == ESC_CHAR:
                    # escape character marks a tweet boundary: flush the
                    # lemmas collected for the previous tweet
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_stat(a_stat, tweet_stat, tlemmas,
                                 a_pos, a_neg, a_neut,
                                 a_pos_re, a_neg_re)
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # `except ValueError:` would be safer here
                except:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                # prefer the surface form when it matches a seed regexp or
                # seed term; otherwise fall back to the normalized lemma
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or iform in seeds:
                    tlemmas.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma) \
                     or ilemma in seeds:
                    tlemmas.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tlemmas.add(ilemma)
            # flush statistics for the last tweet of the file
            _update_stat(a_stat, tweet_stat, tlemmas,
                         a_pos, a_neg, a_neut,
                         a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    # remove words with fewer occurrences than the minimum threshold
    _prune_stat(a_stat)
    return tweet_stat
Ejemplo n.º 9
0
def _read_files(a_crp_files,
                a_pos,
                a_neg,
                a_pos_re=NONMATCH_RE,
                a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []            # sliding window of preceding lemmas
    tok_stat = Counter()        # co-occurrence counts of lemma pairs
    word2cnt = Counter()        # total occurrence count per lemma
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # blank lines, sentence ends, and tweet boundaries reset
                # the co-occurrence window
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # `except ValueError:` would be safer here
                except:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)),
                          file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                # seed-matching tokens are always kept; otherwise require an
                # informative PoS tag and a valid word
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                # count co-occurrence with every lemma in the window
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    # (seed terms are kept regardless of frequency)
    for w, cnt in word2cnt.iteritems():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.iteritems()
                if w1 in word2vecid and w2 in word2vecid and cnt >= MIN_TOK_CNT
                }
    return (max_vecid, word2vecid, tok_stat)
Ejemplo n.º 10
0
def _read_files(a_crp_files, a_pos, a_neg,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []            # sliding window of preceding lemmas
    tok_stat = Counter()        # co-occurrence counts of lemma pairs
    word2cnt = Counter()        # total occurrence count per lemma
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # blank lines, sentence ends, and tweet boundaries reset
                # the co-occurrence window
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # `except ValueError:` would be safer here
                except:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                # seed-matching tokens are always kept; otherwise require an
                # informative PoS tag and a valid word
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                # count co-occurrence with every lemma in the window
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    # (seed terms are kept regardless of frequency)
    for w, cnt in word2cnt.iteritems():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.iteritems()
                if w1 in word2vecid and w2 in word2vecid
                and cnt >= MIN_TOK_CNT
                }
    return (max_vecid, word2vecid, tok_stat)
Ejemplo n.º 11
0
def _read_files(a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return 2-tuple - training sets of features and their gold classes

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    ts_x = []                   # training-set features
    ts_y = []                   # training-set gold classes
    tweet_toks = set()          # tokens/bigrams of the current tweet
    iform = itag = ilemma = prev_lemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            prev_lemma = ""
            for iline in ifile:
                iline = iline.strip().lower()
                if iline and iline[0] == ESC_CHAR:
                    # escape character marks a tweet boundary: flush the
                    # accumulated tokens into the training set
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_ts(ts_x, ts_y, tweet_toks,
                               a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
                    prev_lemma = ""
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    # sentence boundaries break the bigram chain
                    prev_lemma = ""
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # `except ValueError:` would be safer here
                except:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                # prefer the surface form when it matches a seed regexp;
                # otherwise fall back to the normalized lemma
                if a_pos_re.search(iform) or a_neg_re.search(iform):
                    tweet_toks.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    tweet_toks.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tweet_toks.add(ilemma)
                # also record the lemma bigram within the sentence
                if prev_lemma:
                    tweet_toks.add((prev_lemma, ilemma))
                prev_lemma = ilemma
            # flush the last tweet of the file
            _update_ts(ts_x, ts_y, tweet_toks,
                       a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    return _prune_ts(ts_x, ts_y)