import codecs
import sys
from collections import Counter
from copy import deepcopy
from itertools import chain

import numpy as np

# NB: the remaining names referenced below (ENCODING, TAB_RE, FORM2LEMMA,
# NONMATCH_RE, SENT_END_RE, ESC_CHAR, FASTMODE, INFORMATIVE_TAGS,
# MIN_TOK_CNT, TOK_WINDOW, UNK, UNK_I, POSITIVE_IDX, NEGATIVE_IDX,
# NEUTRAL_IDX, check_word, normalize, _read_files_helper, _update_stat,
# _prune_stat, _update_ts, and _prune_ts) are module-level constants and
# helpers defined elsewhere in the respective modules.


def _tkm_add_corpus(a_ising, a_cc_file):
    """Add lexical nodes from corpus to the Ising spin model.

    @param a_ising - instance of the Ising spin model
    @param a_cc_file - file containing conjoined word pairs extracted from
      corpus

    @return \c void

    """
    ifields = []
    iwght = 1.
    ilemma1 = ilemma2 = ""
    with codecs.open(a_cc_file, 'r', ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            ifields = TAB_RE.split(iline)
            if len(ifields) != 3:
                continue
            ilemma1, ilemma2, iwght = ifields
            # map surface forms to lemmas where possible
            if ilemma1 in FORM2LEMMA:
                ilemma1 = FORM2LEMMA[ilemma1]
            if ilemma2 in FORM2LEMMA:
                ilemma2 = FORM2LEMMA[ilemma2]
            if check_word(ilemma1) and check_word(ilemma2):
                a_ising.add_edge(normalize(ilemma1), normalize(ilemma2),
                                 float(iwght), a_add_missing=True)
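
# The snippet below is a hedged, self-contained illustration (not part of
# the original module) of the record format `_tkm_add_corpus' expects in
# `a_cc_file': tab-separated `lemma1<TAB>lemma2<TAB>weight' triples.  The
# local `tab_re' and the sample record are hypothetical stand-ins for the
# module-level TAB_RE and a real corpus file.
def _demo_cc_line_parsing():
    import re
    tab_re = re.compile(r'\t+')        # assumed field separator
    sample = "good\tnice\t2.0"         # hypothetical corpus record
    lemma1, lemma2, wght = tab_re.split(sample)
    assert (lemma1, lemma2, float(wght)) == ("good", "nice", 2.0)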

def _read_files(a_stat, a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_stat - statistics on term occurrences
    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return 3-element list with the numbers of positive, negative, and
      neutral tweets

    @note modifies `a_stat' in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    iform = itag = ilemma = ""
    tlemmas = set()
    tweet_stat = [0, 0, 0]
    seeds = a_pos | a_neg | a_neut
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # a line starting with the escape character opens a new
                # tweet: flush the statistics of the previous one
                if iline and iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_stat(a_stat, tweet_stat, tlemmas, a_pos, a_neg,
                                 a_neut, a_pos_re, a_neg_re)
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                # seed matches on the form or lemma always count; other
                # tokens must carry an informative tag and pass the filter
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                        or iform in seeds:
                    tlemmas.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma) \
                        or ilemma in seeds:
                    tlemmas.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tlemmas.add(ilemma)
    # flush the statistics of the last tweet
    _update_stat(a_stat, tweet_stat, tlemmas, a_pos, a_neg, a_neut,
                 a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    # remove words with fewer occurrences than the minimum threshold
    _prune_stat(a_stat)
    return tweet_stat
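
# A hedged sketch (local stand-ins only, simplified from the full cascade
# above) of the token-filtering precedence: a seed-regex match on the form
# or lemma always keeps a token; otherwise the token must carry an
# informative part-of-speech tag to be counted.
def _demo_token_filter():
    import re
    pos_re = re.compile(r"superb")       # assumed positive-seed regex
    informative_tags = {"NN", "VB"}      # assumed informative tag prefixes

    def keep(form, tag, lemma):
        # seed matches win unconditionally
        if pos_re.search(form) or pos_re.search(lemma):
            return True
        return tag[:2] in informative_tags

    assert keep("superb", "JJ", "superb")   # seed match beats the tag filter
    assert not keep("the", "DT", "the")     # uninformative tag is dropped
    assert keep("dogs", "NNS", "dog")       # informative tag is kept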

def _read_files(a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE,
                a_encoding=ENCODING):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_neut - initial set of neutral terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_encoding - encoding of the corpus files

    @return (word2vecid, max_sent_len, X, Y)

    @note constructs statistics in place

    """
    print("Populating corpus statistics...", end="", file=sys.stderr)
    word2cnt = Counter(ilemma
                       for _, itag, ilemma
                       in _read_files_helper(a_crp_files, a_encoding)
                       if ilemma is not None
                       and itag[:2] in INFORMATIVE_TAGS
                       and check_word(ilemma))
    print(" done", file=sys.stderr)
    # reserve vector ids for the unknown word and for all seed terms
    word2vecid = {UNK: UNK_I}
    for w in chain(a_pos, a_neg, a_neut):
        word2vecid[w] = len(word2vecid)
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.items():
        if cnt >= MIN_TOK_CNT or a_pos_re.search(w) or a_neg_re.search(w):
            word2vecid[w] = len(word2vecid)
    word2cnt.clear()

    # generate the training set
    def check_in_seeds(a_form, a_lemma, a_seeds, a_seed_re):
        """Check whether a form or its lemma belongs to the seed set."""
        return bool(a_seed_re.search(a_form) or a_seed_re.search(a_lemma)
                    or a_form in a_seeds or normalize(a_form) in a_seeds
                    or a_lemma in a_seeds)

    max_sent_len = 0
    X = []
    Y = []
    toks = []
    label = None
    for iform, itag, ilemma in _read_files_helper(a_crp_files, a_encoding):
        if ilemma is None:
            # a sentence boundary: keep the collected tokens only if the
            # sentence received a polarity label
            if toks:
                if label is not None:
                    max_sent_len = max(max_sent_len, len(toks))
                    X.append(deepcopy(toks))
                    Y.append(label)
                del toks[:]
                label = None
            continue
        if ilemma in word2vecid:
            toks.append(word2vecid[ilemma])
        if check_in_seeds(iform, ilemma, a_pos, a_pos_re):
            label = POSITIVE_IDX
        elif check_in_seeds(iform, ilemma, a_neg, a_neg_re):
            label = NEGATIVE_IDX
        elif label is None and check_in_seeds(iform, ilemma, a_neut,
                                              NONMATCH_RE):
            label = NEUTRAL_IDX
    X = np.array([x + [UNK_I] * (max_sent_len - len(x)) for x in X],
                 dtype="int32")
    Y = np.array(Y, dtype="int32")
    return (word2vecid, max_sent_len, X, Y)
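
# A hedged, self-contained sketch of the padding step at the end of
# `_read_files' above: shorter token-id sequences are right-padded with the
# id of the unknown word so that X becomes a rectangular int32 matrix.
# `unk_i' is a local stand-in for the module-level UNK_I.
def _demo_padding():
    import numpy as np
    unk_i = 0                            # assumed id of the unknown word
    sents = [[3, 5], [7, 8, 9]]          # toy token-id sequences
    max_len = max(len(s) for s in sents)
    x = np.array([s + [unk_i] * (max_len - len(s)) for s in sents],
                 dtype="int32")
    assert x.shape == (2, 3)
    assert list(x[0]) == [3, 5, 0]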

def _read_files(a_crp_files, a_pos, a_neg,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # empty lines, sentence boundaries, and tweet delimiters
                # reset the co-occurrence window
                if not iline or SENT_END_RE.match(iline) \
                        or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                # seed matches are always counted; other tokens must carry
                # an informative tag and pass the word filter
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                        or a_pos_re.search(ilemma) \
                        or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                # count co-occurrences with the preceding lemmas within
                # the token window
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.items():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in the context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.items()
                if w1 in word2vecid and w2 in word2vecid
                and cnt >= MIN_TOK_CNT}
    return (max_vecid, word2vecid, tok_stat)
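
# A hedged, self-contained sketch of the sliding-window co-occurrence
# counting above: each new lemma is paired with the lemmas preceding it
# within the token window, yielding one-directional counts.  `tok_window'
# is a local stand-in for the module-level TOK_WINDOW.
def _demo_cooccurrences():
    from collections import Counter
    tok_window = 2                       # assumed window size
    tok_stat = Counter()
    prev_lemmas = []
    for lemma in ["a", "b", "c", "d"]:
        for plemma in prev_lemmas:
            tok_stat[(plemma, lemma)] += 1
        while len(prev_lemmas) > tok_window:
            prev_lemmas.pop(0)
        prev_lemmas.append(lemma)
    # "a" precedes "b" within the window, but not the other way around
    assert tok_stat[("a", "b")] == 1
    assert ("b", "a") not in tok_stat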

def _read_files(a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return 2-tuple - training sets of features and their gold classes

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    ts_x = []
    ts_y = []
    tweet_toks = set()
    iform = itag = ilemma = prev_lemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            prev_lemma = ""
            for iline in ifile:
                iline = iline.strip().lower()
                # a new tweet: flush the features of the previous one
                if iline and iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_ts(ts_x, ts_y, tweet_toks, a_pos, a_neg,
                               a_neut, a_pos_re, a_neg_re)
                    prev_lemma = ""
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    prev_lemma = ""
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform):
                    tweet_toks.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    tweet_toks.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tweet_toks.add(ilemma)
                # add a bigram feature for each pair of adjacent lemmas
                if prev_lemma:
                    tweet_toks.add((prev_lemma, ilemma))
                prev_lemma = ilemma
    # flush the features of the last tweet
    _update_ts(ts_x, ts_y, tweet_toks, a_pos, a_neg, a_neut,
               a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    return _prune_ts(ts_x, ts_y)
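
# A hedged sketch of the feature extraction above: every tweet contributes
# unigram lemma features plus one bigram feature per pair of adjacent kept
# lemmas (sentence and tweet boundaries reset `prev_lemma').
def _demo_bigram_features():
    feats = set()
    prev_lemma = ""
    for lemma in ["not", "good"]:
        feats.add(lemma)
        if prev_lemma:
            feats.add((prev_lemma, lemma))
        prev_lemma = lemma
    assert feats == {"not", "good", ("not", "good")}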