def _get_form2lemma(a_fname):
    """Read file containing form/lemma correspondences

    @param a_fname - name of input file

    @return void (correspondences are read into global variables)
    """
    global STOP_WORDS, FORM2LEMMA
    if not os.path.isfile(a_fname) or not os.access(a_fname, os.R_OK):
        raise RuntimeError("Cannot read from file '{:s}'".format(a_fname))
    iform = itag = ilemma = ""
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            iform, itag, ilemma = TAB_RE.split(iline)
            iform = normalize(iform)
            # only remember lemmas of informative parts of speech; all other
            # forms are treated as stop words
            if len(itag) > 1 and itag[:2] in INFORMATIVE_TAGS:
                FORM2LEMMA[iform] = normalize(ilemma)
            else:
                STOP_WORDS.add(iform)

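# A minimal usage sketch (the file name below is a hypothetical example; the
# input is expected to contain tab-separated form/tag/lemma triples):
#
#     _get_form2lemma("data/form2lemma.txt")
#     # forms with informative tags now map to their lemmas:
#     print(FORM2LEMMA.get(normalize("ging")))
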
def _tkm_add_corpus(a_ising, a_cc_file):
    """Add lexical nodes from corpus to the Ising spin model

    @param a_ising - instance of the Ising spin model
    @param a_cc_file - file containing conjoined word pairs extracted from
      corpus

    @return \c void
    """
    ifields = []
    iwght = 1.
    ilemma1 = ilemma2 = ""
    with codecs.open(a_cc_file, 'r', ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            ifields = TAB_RE.split(iline)
            if len(ifields) != 3:
                continue
            ilemma1, ilemma2, iwght = ifields
            # map word forms to lemmas where possible
            if ilemma1 in FORM2LEMMA:
                ilemma1 = FORM2LEMMA[ilemma1]
            if ilemma2 in FORM2LEMMA:
                ilemma2 = FORM2LEMMA[ilemma2]
            if check_word(ilemma1) and check_word(ilemma2):
                a_ising.add_edge(normalize(ilemma1), normalize(ilemma2),
                                 float(iwght), a_add_missing=True)

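# A minimal usage sketch, assuming `Ising` is the spin-model class used by
# this module (the constructor call is a hypothetical placeholder) and that
# the tab-separated pair file exists:
#
#     ising = Ising()
#     _tkm_add_corpus(ising, "data/conjoined_pairs.tsv")
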
def _output_cc_helper(a_cc_list):
    """Extract sets of coordinatively and adversatively conjoined terms

    @param a_cc_list - list of conjoined phrases

    @return 2-tuple with sets of coordinatively and adversatively conjoined
      terms
    """
    coord = set()
    advers = set()
    ret = (coord, advers)
    trg_coord = trg_advers = None
    if isinstance(a_cc_list, list):
        if not a_cc_list:
            return ret
        a_cc_list[0] = normalize(a_cc_list[0])
        # an adversative conjunction at the head of the chain swaps the
        # target sets for its children
        if a_cc_list[0] in ADVERS_CC:
            trg_coord, trg_advers = advers, coord
        else:
            if a_cc_list[0] not in COORD_CC:
                if VERBOSE:
                    print("WARNING: Unknown coordinative conjunction:"
                          " {!r}".format(a_cc_list[0]), file=sys.stderr)
                coord.add(a_cc_list[0])
            trg_coord, trg_advers = coord, advers
        for chld in a_cc_list[1:]:
            chld_coord, chld_advers = _output_cc_helper(chld)
            trg_coord |= chld_coord
            trg_advers |= chld_advers
    else:
        coord.add(a_cc_list)
    return ret

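# A worked example of the recursion (German lemmas chosen for illustration,
# assuming normalize() leaves them unchanged; the nested list mirrors the
# structure produced by _process_cc_helper() below):
#
#     coord, advers = _output_cc_helper(["gut", ["aber", "langsam"]])
#     # -> coord == set(["gut"]), advers == set(["langsam"])
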
def _process_cc_helper(a_tree, a_iroot, a_cc_main, a_cc_seen=False):
    """Find coordinatively conjoined phrases in DG tree

    @param a_tree - DG tree to process
    @param a_iroot - root node of the tree
    @param a_cc_main - main list of coordinatively conjoined phrases to be
      populated
    @param a_cc_seen - flag indicating that the parent node is already part
      of a coordinatively conjoined chain

    @return list of terms conjoined with the root (empty if the chain was
      already appended to a_cc_main)
    """
    if a_tree.is_empty():
        return []
    ret = [normalize(a_iroot.plemma)]
    # extract coordinatively conjoined chains
    for ichild in a_tree.children[a_iroot.idx]:
        if ichild.pdeprel in CC_RELATIONS:
            ret.append(_process_cc_helper(a_tree, ichild, a_cc_main, True))
        else:
            a_cc_main += _process_cc_helper(a_tree, ichild, a_cc_main, False)
    if len(ret) != 1:
        if a_cc_seen:
            return ret
        else:
            a_cc_main.append(ret)
            return []
    elif a_cc_seen:
        return ret[0]
    else:
        return []

def _crp2mtx(a_crp_files, a_pos, a_neg,
             a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Construct sparse collocation matrix from raw corpus.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return 3-tuple with the number of token ids, a mapping from tokens to
      vector ids, and the adjacency matrix
    """
    # gather one-directional co-occurrence statistics
    max_vecid, word2vecid, tok_stat = _read_files(a_crp_files, a_pos, a_neg,
                                                  a_pos_re, a_neg_re)
    # make sure every seed term has a vector id
    for w in chain(a_pos, a_neg):
        w = normalize(w)
        if w not in word2vecid:
            word2vecid[w] = max_vecid
            max_vecid += 1
    # convert co-occurrence statistics to a sparse matrix
    M = _tokstat2mtx(max_vecid, tok_stat)
    # iterate over the matrix and keep the top 25 vectors with the highest
    # cosine similarity
    _prune_mtx(M)
    return (max_vecid, word2vecid, M.log1p())

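# A minimal usage sketch (file name and seed terms are hypothetical):
#
#     max_vecid, word2vecid, M = _crp2mtx(["corpus.conll"],
#                                         set(["gut"]), set(["schlecht"]))
#     # M is a sparse, log-scaled adjacency matrix over max_vecid token ids
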
def _read_set(a_fname):
    """Read initial seed set of terms.

    @param a_fname - name of input file containing terms

    @return void
    """
    global POS_SET, NEG_SET, NEUT_SET, POS_RE, NEG_RE
    fields = []
    pos_regs = []
    neg_regs = []
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            elif iline.startswith(COMMENT):
                # maybe, we will later introduce some special comments
                continue
            fields = TAB_RE.split(iline)
            # regular-expression entries are collected separately
            if len(fields) > 2 and fields[2] == REGEXP:
                if fields[1] == POSITIVE:
                    pos_regs.append(normalize_reg(fields[0]))
                elif fields[1] == NEGATIVE:
                    neg_regs.append(normalize_reg(fields[0]))
                else:
                    raise NotImplementedError(
                        "Regular expressions are not supported"
                        " for non-polar classes.")
                continue
            if fields[1] == POSITIVE:
                POS_SET.add(normalize(fields[0]))
            elif fields[1] == NEGATIVE:
                NEG_SET.add(normalize(fields[0]))
            elif fields[1] == NEUTRAL:
                NEUT_SET.add(normalize(fields[0]))
            else:
                raise RuntimeError(
                    "Unknown field specification: {:s}".format(fields[1]))
    if pos_regs:
        POS_RE = join_regs(pos_regs)
    if neg_regs:
        NEG_RE = join_regs(neg_regs)

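# The expected input is one tab-separated record per line; a hedged sketch of
# such a seed file (assuming COMMENT, POSITIVE, NEGATIVE, and REGEXP hold the
# literal values shown below, with <TAB> standing for a tab character):
#
#     # polarity seeds
#     gut<TAB>positive
#     schlecht<TAB>negative
#     ^un.+<TAB>negative<TAB>regexp
#
#     _read_set("seeds.txt")   # populates POS_SET, NEG_SET, and NEG_RE
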
def lemmatize(a_form, a_prune=True):
    """Convert word form to its lemma

    @param a_form - word form for which we should obtain lemma
    @param a_prune - flag indicating whether uninformative words
      should be pruned

    @return lemma of the word
    """
    a_form = normalize(a_form)
    if a_prune and a_form in STOP_WORDS:
        return None
    if a_form in FORM2LEMMA:
        return FORM2LEMMA[a_form]
    return a_form

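# A minimal usage sketch (assuming _get_form2lemma() was called first, so that
# FORM2LEMMA and STOP_WORDS are populated):
#
#     lemmatize("ging")    # -> its lemma if known, else the normalized form
#     lemmatize("und")     # -> None, if "und" ended up in STOP_WORDS
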
def _read_files_helper(a_crp_files, a_encoding=ENCODING):
    """Read corpus files and iterate over their tokens.

    @param a_crp_files - files of the original corpus
    @param a_encoding - encoding of the corpus files

    @return iterator over (form, tag, lemma) triples; a (None, None, None)
      triple marks a sentence boundary
    """
    i = 0
    tokens_seen = False
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', a_encoding) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline):
                    continue
                elif iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    if tokens_seen:
                        tokens_seen = False
                        yield None, None, None
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format: {!r}".format(iline),
                          file=sys.stderr)
                    continue
                tokens_seen = True
                yield iform, itag, normalize(ilemma)
    yield None, None, None

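# A minimal usage sketch (the file name is hypothetical); (None, None, None)
# triples mark sentence boundaries:
#
#     for iform, itag, ilemma in _read_files_helper(["corpus.conll"]):
#         if iform is None:
#             continue        # sentence boundary
#         print(ilemma)
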
def tang(a_N, a_emb_fname, a_pos, a_neg, a_neut,
         a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE, a_encoding=ENCODING):
    """Method for generating sentiment lexicons using Tang's approach.

    @param a_N - number of terms to extract
    @param a_emb_fname - name of the file containing word embeddings
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_encoding - encoding of the vector file

    @return list of terms sorted according to their polarity scores
    """
    w2i, EMBS, ndim = read_embeddings(a_emb_fname, a_encoding)
    X, Y = digitize_trainset(w2i, a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
    train, validate, predict, params = init_nnet(EMBS, len(set(Y)), ndim)
    best_params = []
    best_acc = acc = -1
    N = len(Y)
    train_idcs, devtest_idcs = train_test_split(np.arange(N), test_size=0.1)
    devtest_N = float(len(devtest_idcs))
    devtest_X = X[devtest_idcs]
    devtest_Y = Y[devtest_idcs]
    # train
    epoch_i = 0
    prev_cost = 0
    while epoch_i < MAX_EPOCHS:
        np.random.shuffle(train_idcs)
        cost = 0.
        start_time = datetime.utcnow()
        for idx in train_idcs:
            x_i, y_i = X[idx], Y[idx]
            cost += train(x_i, y_i)
        # measure accuracy on the held-out portion
        acc = 0.
        for x_i, y_i in zip(devtest_X, devtest_Y):
            acc += validate(x_i, y_i)
        acc /= devtest_N
        if acc >= best_acc:
            best_params = [p.get_value() for p in params]
            best_acc = acc
            sfx = " *"
        else:
            sfx = ''
        end_time = datetime.utcnow()
        tdelta = (end_time - start_time).total_seconds()
        print("Iteration #{:d} ({:.2f} sec): cost = {:.2f}, "
              "accuracy = {:.2%};{:s}".format(epoch_i, tdelta, cost,
                                              acc, sfx),
              file=sys.stderr)
        if abs(prev_cost - cost) < EPSILON and epoch_i > MIN_EPOCHS:
            break
        else:
            prev_cost = cost
        epoch_i += 1
    # restore the parameters of the best-scoring epoch
    if best_params:
        for p, val in zip(params, best_params):
            p.set_value(val)
    # apply trained classifier to unseen data
    ret = []
    for w, w_idx in w2i.iteritems():
        if normalize(w) in a_pos or a_pos_re.match(w):
            pol_cls = POSITIVE_LBL
            pol_score = FMAX
        elif normalize(w) in a_neg or a_neg_re.match(w):
            pol_cls = NEGATIVE_LBL
            pol_score = FMIN
        else:
            pol_idx, pol_score = predict(w_idx)
            pol_score = pol_score.item(0)
            if pol_idx == POSITIVE_IDX:
                pol_cls = POSITIVE_LBL
            elif pol_idx == NEGATIVE_IDX:
                pol_cls = NEGATIVE_LBL
            else:
                continue
        ret.append((w, pol_cls, pol_score))
    ret.sort(key=lambda el: abs(el[-1]), reverse=True)
    return ret

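# A minimal usage sketch (embedding file and seed sets are hypothetical):
#
#     terms = tang(100, "vectors.txt", set(["gut"]),
#                  set(["schlecht"]), set(["haus"]))
#     for term, pol_cls, score in terms[:10]:
#         print(term, pol_cls, score)
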
def check_in_seeds(a_form, a_lemma, a_seeds, a_seed_re):
    """Check whether a form or its lemma belongs to the seed set.

    @param a_form - word form to check
    @param a_lemma - lemma of the word form
    @param a_seeds - set of seed terms
    @param a_seed_re - regular expression matching seed terms

    @return \c True if the form or lemma is covered by the seeds,
      \c False otherwise
    """
    if a_seed_re.search(a_form) or a_seed_re.search(a_lemma) \
       or a_form in a_seeds or normalize(a_form) in a_seeds \
       or a_lemma in a_seeds:
        return True
    return False

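# A minimal usage sketch (assuming normalize("gut") == "gut" and that
# NONMATCH_RE never matches):
#
#     seeds = set(["gut"])
#     check_in_seeds("gut", "gut", seeds, NONMATCH_RE)    # -> True
#     check_in_seeds("Haus", "haus", seeds, NONMATCH_RE)  # -> False
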
from __future__ import print_function

import argparse
import codecs
import re
import string
import sys

##################################################################
# Variables and Constants
VERBOSE = False
PUNCT_RE = re.compile(r"^(?:"
                      + '|'.join([re.escape(c) for c in string.punctuation])
                      + r")+$")
ESC_CHAR = ""
ENCODING = "utf-8"
CC_RELATIONS = set(["CD", "CJ"])
ADVERS_CC = set([normalize(w) for w in ["aber"]])
COORD_CC = set([normalize(w) for w in ["und", "oder", ","]])


##################################################################
# Methods
def _find_roots(a_tree):
    """Find roots of DG tree

    @param a_tree - DG tree to process

    @return iterator over root nodes of the tree
    """
    for inode in a_tree:
        if inode.phead == '0':
            yield inode

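# A minimal usage sketch (assuming `itree` is a parsed DG tree whose nodes
# carry `phead` attributes):
#
#     cc_main = []
#     for iroot in _find_roots(itree):
#         _process_cc_helper(itree, iroot, cc_main)
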
def add_seeds(seeds, label):
    """Add known seed terms to the training set.

    @param seeds - iterable of seed terms
    @param label - class label to assign to these terms

    @return \c void (w2i is read and X and Y are extended in the
      enclosing scope)
    """
    for iterm in seeds:
        iterm = normalize(iterm)
        if iterm in w2i:
            X.append(w2i[iterm])
            Y.append(label)

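# add_seeds() reads `w2i` and appends to `X` and `Y` from its enclosing scope,
# so it is presumably defined as a nested helper; a hedged sketch of its use
# (label constants as in tang() above):
#
#     add_seeds(a_pos, POSITIVE_IDX)
#     add_seeds(a_neg, NEGATIVE_IDX)
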
def _read_files(a_crp_files, a_pos, a_neg,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place
    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                # sentence boundaries and escaped lines reset the context
                # window
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format: {!r}".format(iline),
                          file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                # seed terms are always kept; all other tokens must bear an
                # informative tag and pass the word filter
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counts are big enough
    for w, cnt in word2cnt.iteritems():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in the context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.iteritems()
                if w1 in word2vecid and w2 in word2vecid
                and cnt >= MIN_TOK_CNT}
    return (max_vecid, word2vecid, tok_stat)

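# A minimal usage sketch (file name and seed sets are hypothetical):
#
#     max_vecid, word2vecid, tok_stat = _read_files(
#         ["corpus.conll"], set(["gut"]), set(["schlecht"]))
#     # tok_stat maps (vecid1, vecid2) pairs to co-occurrence counts
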