Example #1
def _tkm_add_corpus(ising, a_cc_file):
    """Add lexical nodes from corpus to the Ising spin model

    @param a_ising - instance of the Ising spin model
    @param a_cc_file - file containing conjoined word pairs extracted from
      corpus

    @return \c void

    """
    ifields = []
    iwght = 1.
    ilemma1 = ilemma2 = ""
    with codecs.open(a_cc_file, 'r', ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            ifields = TAB_RE.split(iline)
            if len(ifields) != 3:
                continue
            ilemma1, ilemma2, iwght = ifields
            if ilemma1 in FORM2LEMMA:
                ilemma1 = FORM2LEMMA[ilemma1]
            if ilemma2 in FORM2LEMMA:
                ilemma2 = FORM2LEMMA[ilemma2]
            if check_word(ilemma1) and check_word(ilemma2):
                ising.add_edge(normalize(ilemma1),
                               normalize(ilemma2),
                               float(iwght),
                               a_add_missing=True)
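
The loop above expects a tab-separated file with one lemma1<TAB>lemma2<TAB>weight triple per line (the len(ifields) != 3 check silently skips anything else). A minimal usage sketch follows; the Ising stand-in, ENCODING value, and file name are illustrative assumptions, and the call still relies on the module-level helpers (TAB_RE, FORM2LEMMA, check_word, normalize) being in scope:

import codecs

ENCODING = "utf-8"  # assumed corpus encoding


class Ising(object):
    """Minimal stand-in exposing only the interface _tkm_add_corpus calls."""

    def __init__(self):
        self.edges = {}

    def add_edge(self, node1, node2, weight, a_add_missing=False):
        # accumulate weights of conjoined word pairs
        self.edges[(node1, node2)] = \
            self.edges.get((node1, node2), 0.) + weight


ising = Ising()
_tkm_add_corpus(ising, "conjoined_pairs.tsv")  # lemma1<TAB>lemma2<TAB>weight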
Example #2
def _read_files(a_stat, a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_stat - statistics on term occurrences
    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return list - numbers of positive, negative, and neutral tweets

    @note modifies `a_stat' in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    iform = itag = ilemma = ""
    tlemmas = set()
    tweet_stat = [0, 0, 0]
    seeds = a_pos | a_neg | a_neut
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if iline and iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_stat(a_stat, tweet_stat, tlemmas,
                                 a_pos, a_neg, a_neut,
                                 a_pos_re, a_neg_re)
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or iform in seeds:
                    tlemmas.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma) \
                     or ilemma in seeds:
                    tlemmas.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tlemmas.add(ilemma)
            _update_stat(a_stat, tweet_stat, tlemmas,
                         a_pos, a_neg, a_neut,
                         a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    # remove words with fewer occurrences than the minimum threshold
    _prune_stat(a_stat)
    return tweet_stat
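
For reference, a sketch of the three-column corpus format this loop consumes; the concrete ESC_CHAR and SENT_END_RE values are assumptions, since only their usage is visible here:

import re

ESC_CHAR = "\x1b"                   # assumed tweet-boundary marker
TAB_RE = re.compile(r"\t+")         # assumed field separator
SENT_END_RE = re.compile(r"^###$")  # assumed sentence-end marker

sample = [ESC_CHAR + "id-1",  # starts a new tweet
          "Cats\tNN\tcat",    # form<TAB>tag<TAB>lemma
          "purr\tVB\tpurr",
          "###"]              # sentence boundary
for iline in sample:
    iline = iline.strip().lower()
    if iline and iline[0] == ESC_CHAR:
        continue  # tweet boundary: flush per-tweet statistics
    elif not iline or SENT_END_RE.match(iline):
        continue  # empty line or sentence end
    iform, itag, ilemma = TAB_RE.split(iline)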
Example #3
def _read_files(a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE,
                a_encoding=ENCODING):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_neut - initial set of neutral terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_encoding - encoding of the corpus files

    @return (word2vecid, max_sent_len, x, y)

    @note constructs statistics in place

    """
    print("Populating corpus statistics...",
          end="", file=sys.stderr)
    word2cnt = Counter(ilemma
                       for _, itag, ilemma in _read_files_helper(a_crp_files,
                                                                 a_encoding)
                       if ilemma is not None and itag[:2] in INFORMATIVE_TAGS
                       and check_word(ilemma))
    print(" done", file=sys.stderr)
    word2vecid = {UNK: UNK_I}
    for w in chain(a_pos, a_neg, a_neut):
        word2vecid[w] = len(word2vecid)
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.items():
        if cnt >= MIN_TOK_CNT or a_pos_re.search(w) or a_neg_re.search(w):
            word2vecid[w] = len(word2vecid)
    word2cnt.clear()

    # generate the training set
    def check_in_seeds(a_form, a_lemma, a_seeds, a_seed_re):
        if a_seed_re.search(a_form) or a_seed_re.search(a_lemma) \
           or a_form in a_seeds or normalize(a_form) in a_seeds \
           or a_lemma in a_seeds:
            return True
        return False

    max_sent_len = 0
    X = []
    Y = []
    toks = []
    label = None
    for iform, itag, ilemma in _read_files_helper(a_crp_files, a_encoding):
        if ilemma is None:
            if toks:
                if label is not None:
                    max_sent_len = max(max_sent_len, len(toks))
                    X.append(deepcopy(toks))
                    Y.append(label)
                del toks[:]
                label = None
            continue
        if ilemma in word2vecid:
            toks.append(word2vecid[ilemma])
        if check_in_seeds(iform, ilemma, a_pos, a_pos_re):
            label = POSITIVE_IDX
        elif check_in_seeds(iform, ilemma, a_neg, a_neg_re):
            label = NEGATIVE_IDX
        elif label is None and check_in_seeds(iform, ilemma,
                                              a_neut, NONMATCH_RE):
            label = NEUTRAL_IDX
    X = np.array(
        [x + [UNK_I] * (max_sent_len - len(x))
         for x in X], dtype="int32")
    Y = np.array(Y, dtype="int32")
    return (word2vecid, max_sent_len, X, Y)
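
The final np.array call right-pads every token-id sequence with the unknown-token id so that all rows share one length; a toy reproduction of that construction, assuming UNK_I = 0:

import numpy as np

UNK_I = 0                         # assumed id of the unknown token
X = [[3, 5, 7], [4, 9]]           # token-id sequences of unequal length
max_sent_len = max(len(x) for x in X)
X = np.array([x + [UNK_I] * (max_sent_len - len(x)) for x in X],
             dtype="int32")
# X is now [[3, 5, 7],
#           [4, 9, 0]]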
Example #4
def _read_files(a_crp_files,
                a_pos,
                a_neg,
                a_pos_re=NONMATCH_RE,
                a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.items():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.items()
                if w1 in word2vecid and w2 in word2vecid and cnt >= MIN_TOK_CNT
                }
    return (max_vecid, word2vecid, tok_stat)
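
The inner window logic keeps a short buffer of preceding lemmas and counts ordered (left, right) pairs, which is what makes the co-occurrences one-directional; the same logic in isolation, with an assumed TOK_WINDOW = 2:

from collections import Counter

TOK_WINDOW = 2                   # assumed window size
tok_stat = Counter()
prev_lemmas = []
for ilemma in ["a", "b", "c", "d"]:
    for plemma in prev_lemmas:
        tok_stat[(plemma, ilemma)] += 1  # ordered pair, one direction only
    while len(prev_lemmas) > TOK_WINDOW:
        prev_lemmas.pop(0)               # drop lemmas that left the window
    prev_lemmas.append(ilemma)
# tok_stat now holds ("a", "b"), ("a", "c"), ("b", "c"), ("b", "d"), ...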
Example #5
def _read_files(a_crp_files, a_pos, a_neg,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.items():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.items()
                if w1 in word2vecid and w2 in word2vecid
                and cnt >= MIN_TOK_CNT
                }
    return (max_vecid, word2vecid, tok_stat)
Example #6
def _read_files(a_crp_files,
                a_pos,
                a_neg,
                a_neut,
                a_pos_re=NONMATCH_RE,
                a_neg_re=NONMATCH_RE,
                a_encoding=ENCODING):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_neut - initial set of neutral terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_encoding - encoding of the corpus files

    @return (word2vecid, max_sent_len, x, y)

    @note constructs statistics in place

    """
    print("Populating corpus statistics...", end="", file=sys.stderr)
    word2cnt = Counter(
        ilemma
        for _, itag, ilemma in _read_files_helper(a_crp_files, a_encoding)
        if ilemma is not None and itag[:2] in INFORMATIVE_TAGS
        and check_word(ilemma))
    print(" done", file=sys.stderr)
    word2vecid = {UNK: UNK_I}
    for w in chain(a_pos, a_neg, a_neut):
        word2vecid[w] = len(word2vecid)
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.items():
        if cnt >= MIN_TOK_CNT or a_pos_re.search(w) or a_neg_re.search(w):
            word2vecid[w] = len(word2vecid)
    word2cnt.clear()

    # generate the training set
    def check_in_seeds(a_form, a_lemma, a_seeds, a_seed_re):
        if a_seed_re.search(a_form) or a_seed_re.search(a_lemma) \
           or a_form in a_seeds or normalize(a_form) in a_seeds \
           or a_lemma in a_seeds:
            return True
        return False

    max_sent_len = 0
    X = []
    Y = []
    toks = []
    label = None
    for iform, itag, ilemma in _read_files_helper(a_crp_files, a_encoding):
        if ilemma is None:
            if toks:
                if label is not None:
                    max_sent_len = max(max_sent_len, len(toks))
                    X.append(deepcopy(toks))
                    Y.append(label)
                del toks[:]
                label = None
            continue
        if ilemma in word2vecid:
            toks.append(word2vecid[ilemma])
        if check_in_seeds(iform, ilemma, a_pos, a_pos_re):
            label = POSITIVE_IDX
        elif check_in_seeds(iform, ilemma, a_neg, a_neg_re):
            label = NEGATIVE_IDX
        elif label is None and check_in_seeds(iform, ilemma, a_neut,
                                              NONMATCH_RE):
            label = NEUTRAL_IDX
    X = np.array([x + [UNK_I] * (max_sent_len - len(x)) for x in X],
                 dtype="int32")
    Y = np.array(Y, dtype="int32")
    return (word2vecid, max_sent_len, X, Y)
Example #7
def _read_files(a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return 2-tuple - training sets of features and their gold classes

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    ts_x = []
    ts_y = []
    tweet_toks = set()
    iform = itag = ilemma = prev_lemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            prev_lemma = ""
            for iline in ifile:
                iline = iline.strip().lower()
                if iline and iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_ts(ts_x, ts_y, tweet_toks,
                               a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
                    prev_lemma = ""
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    prev_lemma = ""
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform):
                    tweet_toks.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    tweet_toks.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tweet_toks.add(ilemma)
                if prev_lemma:
                    tweet_toks.add((prev_lemma, ilemma))
                prev_lemma = ilemma
            _update_ts(ts_x, ts_y, tweet_toks,
                       a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    return _prune_ts(ts_x, ts_y)
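
Unlike the earlier variants, this one also records adjacent-lemma bigrams as features; a toy sketch of the feature set collected for a single tweet:

tweet_toks = set()
prev_lemma = ""
for ilemma in ["good", "movie", "tonight"]:
    tweet_toks.add(ilemma)                    # unigram feature
    if prev_lemma:
        tweet_toks.add((prev_lemma, ilemma))  # bigram feature
    prev_lemma = ilemma
# tweet_toks == {"good", "movie", "tonight",
#                ("good", "movie"), ("movie", "tonight")}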