def _get_form2lemma(a_fname):
    """Read file containing form/lemma correspondences.

    @param a_fname - name of input file
    @return void (correspondences are read into global variables)

    """
    global STOP_WORDS, FORM2LEMMA
    if not os.path.isfile(a_fname) or not os.access(a_fname, os.R_OK):
        raise RuntimeError(
            "Cannot read from file '{:s}'".format(a_fname))
    iform = itag = ilemma = ""
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            iform, itag, ilemma = TAB_RE.split(iline)
            iform = normalize(iform)
            # only forms with informative PoS tags receive a lemma
            # mapping; all remaining forms are treated as stop words
            if len(itag) > 1 and itag[:2] in INFORMATIVE_TAGS:
                FORM2LEMMA[iform] = normalize(ilemma)
            else:
                STOP_WORDS.add(iform)
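
# A minimal usage sketch, assuming the module-level globals used above
# are defined ("data/form2lemma.txt" is a hypothetical path to a
# tab-separated file with FORM<TAB>TAG<TAB>LEMMA lines).
def _example_get_form2lemma():
    _get_form2lemma("data/form2lemma.txt")
    # forms with informative PoS tags are now mapped to their lemmas,
    # all remaining forms were added to the stop-word set
    print("lemmas: {:d}, stop words: {:d}".format(
        len(FORM2LEMMA), len(STOP_WORDS)), file=sys.stderr)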
def _tkm_add_corpus(ising, a_cc_file):
    """Add lexical nodes from corpus to the Ising spin model.

    @param ising - instance of the Ising spin model
    @param a_cc_file - file containing conjoined word pairs extracted
      from the corpus
    @return \c void

    """
    ifields = []
    iwght = 1.
    ilemma1 = ilemma2 = ""
    with codecs.open(a_cc_file, 'r', ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            ifields = TAB_RE.split(iline)
            if len(ifields) != 3:
                continue
            ilemma1, ilemma2, iwght = ifields
            # map surface forms to lemmas where a mapping is known
            if ilemma1 in FORM2LEMMA:
                ilemma1 = FORM2LEMMA[ilemma1]
            if ilemma2 in FORM2LEMMA:
                ilemma2 = FORM2LEMMA[ilemma2]
            if check_word(ilemma1) and check_word(ilemma2):
                ising.add_edge(normalize(ilemma1), normalize(ilemma2),
                               float(iwght), a_add_missing=True)
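
# A minimal usage sketch, assuming an `Ising` class providing the
# `add_edge()` interface used above is available in this module
# ("data/cc_pairs.txt" is a hypothetical path to a file of
# LEMMA1<TAB>LEMMA2<TAB>WEIGHT lines).
def _example_tkm_add_corpus():
    ising = Ising()  # assumed constructor of the spin model
    _tkm_add_corpus(ising, "data/cc_pairs.txt")
    return ising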
def _read_set(a_fname):
    """Read initial seed set of terms.

    @param a_fname - name of input file containing terms
    @return void

    """
    global POS_SET, NEG_SET, NEUT_SET, POS_RE, NEG_RE
    fields = []
    pos_regs = []
    neg_regs = []
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            elif iline.startswith(COMMENT):
                # maybe we will later introduce some special comments
                continue
            fields = TAB_RE.split(iline)
            # a third field equal to REGEXP marks the term as a regular
            # expression (only supported for the polar classes)
            if len(fields) > 2 and fields[2] == REGEXP:
                if fields[1] == POSITIVE:
                    pos_regs.append(normalize_reg(fields[0]))
                elif fields[1] == NEGATIVE:
                    neg_regs.append(normalize_reg(fields[0]))
                else:
                    raise NotImplementedError(
                        "Regular expressions are not supported"
                        " for non-polar classes.")
                continue
            if fields[1] == POSITIVE:
                POS_SET.add(normalize(fields[0]))
            elif fields[1] == NEGATIVE:
                NEG_SET.add(normalize(fields[0]))
            elif fields[1] == NEUTRAL:
                NEUT_SET.add(normalize(fields[0]))
            else:
                raise RuntimeError(
                    "Unknown field specification: {:s}".format(fields[1]))
    if pos_regs:
        POS_RE = join_regs(pos_regs)
    if neg_regs:
        NEG_RE = join_regs(neg_regs)
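
# A minimal usage sketch ("data/seeds.txt" is a hypothetical seed file
# whose tab-separated lines pair a term with one of the POSITIVE,
# NEGATIVE, or NEUTRAL labels; an optional third field equal to REGEXP
# marks the first field as a regular expression).
def _example_read_set():
    _read_set("data/seeds.txt")
    print("pos: {:d}, neg: {:d}, neut: {:d}".format(
        len(POS_SET), len(NEG_SET), len(NEUT_SET)), file=sys.stderr)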
def _read_files_helper(a_crp_files, a_encoding=ENCODING):
    """Iterate over tokens of the original corpus files.

    @param a_crp_files - files of the original corpus
    @param a_encoding - encoding of the corpus files
    @return iterator over (form, tag, lemma) triples; the triple
      (None, None, None) marks a tweet boundary

    """
    i = 0
    tokens_seen = False
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', a_encoding) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline):
                    continue
                elif iline[0] == ESC_CHAR:
                    # in fast mode, stop after 300 tweets
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    if tokens_seen:
                        tokens_seen = False
                        yield None, None, None
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                tokens_seen = True
                yield iform, itag, normalize(ilemma)
    yield None, None, None
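
# A minimal usage sketch: iterate over a hypothetical corpus file and
# count tokens and tweet boundaries (boundaries are signaled by
# (None, None, None) triples).
def _example_read_files_helper():
    n_toks = n_bounds = 0
    for iform, itag, ilemma in _read_files_helper(["data/corpus.txt"]):
        if iform is None:
            n_bounds += 1
        else:
            n_toks += 1
    print("{:d} tokens, {:d} tweet boundaries".format(n_toks, n_bounds),
          file=sys.stderr)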
def _read_files(a_stat, a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_stat - statistics on term occurrences
    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @return list with counts of positive, negative, and neutral tweets
    @note modifies `a_stat' in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    iform = itag = ilemma = ""
    tlemmas = set()
    tweet_stat = [0, 0, 0]
    seeds = a_pos | a_neg | a_neut
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if iline and iline[0] == ESC_CHAR:
                    # in fast mode, stop after 300 tweets
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_stat(a_stat, tweet_stat, tlemmas,
                                 a_pos, a_neg, a_neut,
                                 a_pos_re, a_neg_re)
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or iform in seeds:
                    tlemmas.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma) \
                        or ilemma in seeds:
                    tlemmas.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tlemmas.add(ilemma)
    _update_stat(a_stat, tweet_stat, tlemmas,
                 a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    # remove words with fewer occurrences than the minimum threshold
    _prune_stat(a_stat)
    return tweet_stat
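
# A minimal usage sketch, assuming `a_stat` may start out as an empty
# dictionary that _update_stat() and _prune_stat() fill and prune in
# place, and that the seed sets and regexps were previously populated
# by _read_set() ("data/corpus.txt" is a hypothetical corpus file).
def _example_read_files_stat():
    stat = {}
    n_pos, n_neg, n_neut = _read_files(
        stat, ["data/corpus.txt"], POS_SET, NEG_SET, NEUT_SET,
        POS_RE, NEG_RE)
    print("{:d} positive, {:d} negative, {:d} neutral tweets".format(
        n_pos, n_neg, n_neut), file=sys.stderr)
    return stat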
def _read_files(a_crp_files, a_pos, a_neg,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @return (max_vecid, word2vecid, tok_stat)
    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    # in fast mode, stop after 300 tweets
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                # count co-occurrences within a sliding token window
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.iteritems():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in the context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.iteritems()
                if w1 in word2vecid and w2 in word2vecid
                and cnt >= MIN_TOK_CNT}
    return (max_vecid, word2vecid, tok_stat)
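
# A minimal usage sketch: build the co-occurrence statistics used by
# the vector-space methods, assuming POS_SET/NEG_SET and POS_RE/NEG_RE
# were previously populated by _read_set() ("data/corpus.txt" is a
# hypothetical corpus file).
def _example_read_files_vecids():
    max_vecid, word2vecid, tok_stat = _read_files(
        ["data/corpus.txt"], POS_SET, NEG_SET, POS_RE, NEG_RE)
    print("{:d} vector ids, {:d} co-occurrence pairs".format(
        max_vecid, len(tok_stat)), file=sys.stderr)
    return (max_vecid, word2vecid, tok_stat)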
def _read_files(a_crp_files, a_pos, a_neg, a_neut,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @return 2-tuple - training sets of features and their gold classes

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    ts_x = []
    ts_y = []
    tweet_toks = set()
    iform = itag = ilemma = prev_lemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            prev_lemma = ""
            for iline in ifile:
                iline = iline.strip().lower()
                if iline and iline[0] == ESC_CHAR:
                    # in fast mode, stop after 300 tweets
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    _update_ts(ts_x, ts_y, tweet_toks,
                               a_pos, a_neg, a_neut,
                               a_pos_re, a_neg_re)
                    prev_lemma = ""
                    continue
                elif not iline or SENT_END_RE.match(iline):
                    prev_lemma = ""
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform):
                    tweet_toks.add(iform)
                elif a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    tweet_toks.add(ilemma)
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                else:
                    tweet_toks.add(ilemma)
                # also record bigrams of adjacent informative lemmas
                if prev_lemma:
                    tweet_toks.add((prev_lemma, ilemma))
                prev_lemma = ilemma
    _update_ts(ts_x, ts_y, tweet_toks,
               a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
    print(" done", file=sys.stderr)
    return _prune_ts(ts_x, ts_y)
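
# A minimal usage sketch: obtain the pruned training set, assuming
# POS_SET/NEG_SET/NEUT_SET and POS_RE/NEG_RE were previously populated
# by _read_set() and that _prune_ts() returns the (ts_x, ts_y) pair, as
# the docstring above indicates ("data/corpus.txt" is a hypothetical
# corpus file).
def _example_read_files_ts():
    ts_x, ts_y = _read_files(
        ["data/corpus.txt"], POS_SET, NEG_SET, NEUT_SET,
        POS_RE, NEG_RE)
    print("{:d} training instances".format(len(ts_y)), file=sys.stderr)
    return (ts_x, ts_y)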