#shared imports for the pipeline snippets below
import os
import sys
from difflib import SequenceMatcher
from operator import itemgetter

def write_tags(self, matches_name, company_id):
    dic_rel = {}
    dic_art = {}
    matches = MatchLoader(company_id, matches_name)
    tokens = TokenLoader(company_id)
    rel_ids = matches.get_release_ids()
    for count, release_id in enumerate(rel_ids):
        print 'processing release #{0} of {1}'.format(count + 1, len(rel_ids))
        tmp = tokens.get_release_tokens(release_id, False)
        self._process_tokens(tmp, dic_rel, release_id)
    art_ids = matches.get_article_ids()
    for count, article_id in enumerate(art_ids):
        print 'processing article #{0} of {1}'.format(count + 1, len(art_ids))
        tmp = tokens.get_article_tokens(article_id, False)
        self._process_tokens(tmp, dic_art, article_id)
    path1 = common.get_postags_path()
    path2 = os.path.join(path1, matches_name)
    path = os.path.join(path2, common.DOCTYPE_PR)
    self._pickle(company_id, dic_rel, path)
    path = os.path.join(path2, common.DOCTYPE_NEWS)
    self._pickle(company_id, dic_art, path)
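#_process_tokens and _pickle are helpers of the tagger class this method
#belongs to and are not shown here. A minimal sketch of what _pickle
#plausibly does, assuming one pickle file per company under the given
#directory (the layout and the cPickle choice are assumptions):
import cPickle

def _pickle_sketch(company_id, dic, path):
    #dic maps document id -> processed tokens/tags for that document
    if not os.path.exists(path):
        os.makedirs(path)
    with open(os.path.join(path, str(company_id)), 'wb') as f:
        cPickle.dump(dic, f)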
def _process(self, company_id):
    matchloader = MatchLoader(company_id, MATCHES_NAME)
    tokens = TokenLoader(company_id)
    releases = ReleaseLoader(company_id).get_releases()
    articles = ArticleLoader(company_id).get_articles()
    scores = ScoreLoader(company_id)
    for release_id in matchloader.get_release_ids():
        release = releases[release_id]
        rel_tokens = tokens.get_stripped_release_token_block(release_id, 0, sys.maxint)
def print_matrix(self):
    sb = []
    sb.append('co-id, rel-id, art-id, rel-len, art-len, rel-used, art-added, '
              'rel-subj-score, art-subj-score, rel-sent-score, art-sent-score\n')
    for company_id in range(1, 41):
        matches = MatchLoader(company_id, self._match_name)
        tokens = TokenLoader(company_id)
        scores = ScoreLoader(company_id)
        for release_id in matches.get_release_ids():
            rel_tokens = tokens.get_stripped_release_token_block(release_id, 0, sys.maxint)
            #release subjectivity score (float division; int division would truncate)
            rel_subj = float(scores.count_subj_rel_sentences(release_id)) / \
                scores.count_all_rel_sentences(release_id)
            #release sentiment score
            if scores.count_subj_rel_sentences(release_id) == 0:
                rel_sent = 0
            else:
                pos_minus_neg = scores.count_pos_rel_sentences(release_id) - \
                    scores.count_neg_rel_sentences(release_id)
                rel_sent = float(pos_minus_neg) / scores.count_subj_rel_sentences(release_id)
            for article_id in matches.get_article_ids(release_id):
                art_tokens = tokens.get_stripped_article_token_block(article_id, 0, sys.maxint)
                blocks = matches.get_matches(release_id, article_id)
                blocklen = 0
                for b in blocks:
                    start = b[1]  #start of block in release
                    length = b[2]
                    end = start + length
                    block_tokens = tokens.get_stripped_release_token_block(release_id, start, end)
                    blocklen += len(block_tokens)
                rel_used = float(blocklen) / len(rel_tokens)
                art_added = 1 - float(blocklen) / len(art_tokens)
                #article subjectivity score
                art_subj = float(scores.count_subj_art_sentences(article_id)) / \
                    scores.count_all_art_sentences(article_id)
                #article sentiment score
                if scores.count_subj_art_sentences(article_id) == 0:
                    art_sent = 0
                else:
                    pos_minus_neg = scores.count_pos_art_sentences(article_id) - \
                        scores.count_neg_art_sentences(article_id)
                    art_sent = float(pos_minus_neg) / scores.count_subj_art_sentences(article_id)
                sb.append('{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}\n'.format(
                    company_id, release_id, article_id, len(rel_tokens), len(art_tokens),
                    rel_used, art_added, rel_subj, art_subj, rel_sent, art_sent))
    text = ''.join(sb)
    print text
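#Self-contained sketch (hypothetical counts) of the four scores computed in
#print_matrix, to make the arithmetic concrete:
#  rel_used  = matched tokens / release length  (share of the release reused)
#  art_added = 1 - matched tokens / article length  (share of the article that is new)
#  subjectivity = subjective sentences / all sentences
#  sentiment = (positive - negative) / subjective sentences
def _demo_matrix_scores():
    blocklen, rel_len, art_len = 120, 200, 400  #hypothetical token counts
    subj, total, pos, neg = 5, 20, 3, 1  #hypothetical sentence counts
    rel_used = float(blocklen) / rel_len  #0.6
    art_added = 1 - float(blocklen) / art_len  #0.7
    subj_score = float(subj) / total  #0.25
    sent_score = float(pos - neg) / subj  #0.4
    print rel_used, art_added, subj_score, sent_score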
def make_subset(self, matches_name_in, required_length, min_length, subset_name_out):
    subset_maker = SubsetMaker()
    pair_counter = 0
    for company_id in range(1, 41):
        print 'Processing company {0}'.format(company_id)
        tokens = TokenLoader(company_id)
        matches = MatchLoader(company_id, matches_name_in)
        #'bfilter' was undefined in the source; assumed to be a block filter
        #built from the length thresholds passed to this method
        bfilter = BlockFilter(company_id, required_length, min_length)
        pr_set = set()
        news_set = set()
        for release_id in matches.get_release_ids():
            for article_id in matches.get_article_ids(release_id):
                blocks = matches.get_matches(release_id, article_id)
                blocklist = bfilter.get_blocks(blocks, release_id, article_id)
                if len(blocklist) > 0:  #there are valid blocks according to this criterion
                    pr_set.add(release_id)
                    news_set.add(article_id)
                    pair_counter += 1
        #assumed SubsetMaker call: register this company's release/article id sets
        subset_maker.add(company_id, pr_set, news_set)
    print 'Total pairs: {0}'.format(pair_counter)
    subset_maker.save(subset_name_out)
class TextWriter(object):
    def __init__(self, company_id, release_ids, article_ids, output_name):
        self._company_id = company_id
        self._release_ids = release_ids
        self._article_ids = article_ids
        self._output_name = output_name
        self._tokens = TokenLoader(company_id)
        self._make_dirs()

    def write(self):
        br = ConfigReader().get('MARKER_BR')
        for release_id in self._release_ids:
            tokens = self._tokens.get_release_tokens(release_id, False)
            text = ' '.join(tokens)
            text = text.replace(br, '\n')
            path = self._get_filepath(common.DOCTYPE_PR, release_id)
            with open(path, 'w') as f:
                f.write(text)
        for article_id in self._article_ids:
            tokens = self._tokens.get_article_tokens(article_id, False)
            text = ' '.join(tokens)
            text = text.replace(br, '\n')
            path = self._get_filepath(common.DOCTYPE_NEWS, article_id)
            with open(path, 'w') as f:
                f.write(text)

    def _get_filepath(self, doctype, text_id):
        path_dir = common.get_text_path(self._output_name, self._company_id)
        path_subdir = os.path.join(path_dir, doctype)
        return os.path.join(path_subdir, str(text_id))

    def _make_dirs(self):
        path = common.get_text_path(self._output_name, self._company_id)
        if not os.path.exists(path):
            os.mkdir(path)
        rel_path = os.path.join(path, common.DOCTYPE_PR)
        if not os.path.exists(rel_path):
            os.mkdir(rel_path)
        art_path = os.path.join(path, common.DOCTYPE_NEWS)
        if not os.path.exists(art_path):
            os.mkdir(art_path)
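#Hypothetical usage sketch for TextWriter: the company id, id lists, and
#output name below are placeholders, and the corpus for that company must
#already exist on disk.
def _demo_textwriter():
    writer = TextWriter(1, [10, 11], [100, 101], 'plaintext-sample')
    writer.write()  #one plain-text file per id under the PR and NEWS subdirectories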
class MatchFilter(object):
    def __init__(self, company_id, match_name_in, match_name_out):
        self._company_id = company_id
        self._match_name_in = match_name_in
        self._match_name_out = match_name_out
        self._tokens = TokenLoader(company_id)

    def filter_exclude_pairs(self, pairs_name):
        matches = MatchLoader(self._company_id, self._match_name_in)
        maker = MatchMaker(self._company_id, self._match_name_out)
        pairs = PairLoader(self._company_id, pairs_name)
        for release_id in matches.get_release_ids():
            for article_id in matches.get_article_ids(release_id):
                if not pairs.has_pair(release_id, article_id):
                    blocks = matches.get_matches(release_id, article_id)
                    maker.add_blocks(release_id, article_id, blocks)
        maker.save()

    def filter_by_min_len(self, min_len):
        matches = MatchLoader(self._company_id, self._match_name_in)
        maker = MatchMaker(self._company_id, self._match_name_out)
        for release_id in matches.get_release_ids():
            for article_id in matches.get_article_ids(release_id):
                blocks = matches.get_matches(release_id, article_id)
                newblocks = []
                for b in blocks:
                    start = b[1]  #start of block in release
                    length = b[2]
                    end = start + length
                    tkns = self._tokens.get_stripped_release_token_block(
                        release_id, start, end)
                    if len(tkns) >= min_len:
                        newblocks.append(b)
                if len(newblocks) > 0:
                    maker.add_blocks(release_id, article_id, newblocks)
        maker.save()
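#Hypothetical usage sketch for MatchFilter: the match-set names and the
#minimum length are placeholders for whatever the pipeline actually uses.
def _demo_matchfilter():
    mfilter = MatchFilter(1, 'matches-raw', 'matches-min5')
    mfilter.filter_by_min_len(5)  #keep only blocks of >= 5 stripped release tokens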
def test_tokenloader():
    company_id = int(sys.argv[1])
    t = TokenLoader(company_id)
    r = ReleaseLoader(company_id)
    a = ArticleLoader(company_id)
    print 'Testing TokenLoader'
    print 'company-id: {0}'.format(company_id)
    articles = a.get_articles()
    article_id = articles.itervalues().next().id()
    print 'article-id: {0}'.format(article_id)
    print 'TOKENS:'
    print t.get_article_tokens(article_id, False)
    releases = r.get_releases()
    release_id = releases.itervalues().next().id()
    print 'release-id: {0}'.format(release_id)
    print 'TOKENS:'
    print t.get_release_tokens(release_id, False)
class BlockFinder(object):
    def __init__(self, company_id, matches_name):
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._br = ConfigReader().get('MARKER_BR')

    def print_all_matching_blocks(self, min_len, max_len):
        for release_id in self._matchloader.get_release_ids():
            for article_id in self._matchloader.get_article_ids(release_id):
                blocks = self._matchloader.get_matches(release_id, article_id)
                for block in blocks:
                    i = block[0]
                    j = block[1]
                    k = block[2]
                    rel_match = self._tokens.get_stripped_release_token_block(
                        release_id, j, j + k)
                    if len(rel_match) >= min_len and len(rel_match) < max_len:
                        mb = ' '.join(rel_match)
                        mb = mb.replace(self._br, ' ')
                        print mb

    #prints blocks of min_len or larger occurring in more than one release -
    #i.e., bad discriminators between releases
    def print_all_nondiscrim_release_blocks(self, min_len, max_len):
        blockset_dict = {}
        for release_id in self._matchloader.get_release_ids():
            blockset = set()  #set of blocks for the current release
            blockset_dict[release_id] = blockset
            for article_id in self._matchloader.get_article_ids(release_id):
                blocks = self._matchloader.get_matches(release_id, article_id)
                for block in blocks:
                    i = block[0]
                    j = block[1]
                    k = block[2]
                    rel_match = self._tokens.get_stripped_release_token_block(
                        release_id, j, j + k)
                    if len(rel_match) >= min_len and len(rel_match) < max_len:
                        mb = ' '.join(rel_match)
                        mb = mb.replace(self._br, ' ')
                        mb = mb.lower().strip()
                        blockset.add(mb)
        #count the number of releases in which each block occurs
        bcounts = {}
        for release_id in blockset_dict:
            blockset = blockset_dict[release_id]
            for b in blockset:
                if b in bcounts:
                    bcounts[b] += 1
                else:
                    bcounts[b] = 1
        #print blocks which occur in more than one release
        result = [key for key in bcounts if bcounts[key] > 1]
        for r in result:
            print r
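#Hypothetical usage sketch for BlockFinder: the match-set name and the
#length window are placeholders.
def _demo_blockfinder():
    finder = BlockFinder(1, 'matches-raw')
    finder.print_all_nondiscrim_release_blocks(3, 15)  #boilerplate candidates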
class MatchFinder(object):
    def __init__(self, company_id, release_ids, article_ids, required_length,
                 min_length, blocks_name_toignore):
        self._company_id = company_id
        self._release_ids = release_ids
        self._article_ids = article_ids
        self._required_length = required_length
        self._min_length = min_length
        self._tokens = TokenLoader(company_id)
        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()
        self._ignoreblocks = BlockLoader(company_id, blocks_name_toignore).get_blocks()
        self._count_ignore = 0
        dloader = DuplicateLoader(company_id)
        self._rel_duplicates = dloader.get_release_duplicates()
        self._art_duplicates = dloader.get_article_duplicates()

    def find_matches(self, output_name):
        matchmaker = MatchMaker(self._company_id, output_name)
        matcher = SequenceMatcher(autojunk=False)
        message = 'Processing company {0}: release {1} of {2}; article {3} of {4}'
        pairs_counter = 0
        for i, release_id in enumerate(self._release_ids):  #loop through releases
            if release_id in self._rel_duplicates:
                continue
            matcher.set_seq2(self._tokens.get_release_tokens(release_id, True))
            release_date = self._releases[release_id].date()
            for j, article_id in enumerate(self._article_ids):  #loop through articles
                if article_id in self._art_duplicates:
                    continue
                if j % 100 == 0:
                    print message.format(self._company_id, i + 1, len(self._release_ids),
                                         j + 1, len(self._article_ids))
                matcher.set_seq1(self._tokens.get_article_tokens(article_id, True))
                article_date = self._articles[article_id].date()
                if article_date >= release_date:
                    #search for matches only if the article appeared after the release
                    #block form: (i, j, k) where i = start in article (seq1),
                    #j = start in release (seq2), k = length
                    blocks = matcher.get_matching_blocks()
                    if len(blocks) > 0:  #if there are blocks
                        valid_blocks = self._get_blocks(blocks, release_id, article_id)
                        if len(valid_blocks) > 0:  #if there are valid blocks
                            matchmaker.add_blocks(release_id, article_id, valid_blocks)
                            print '\tfound match for release={0} and article={1}'.format(
                                release_id, article_id)
                            pairs_counter += 1
        print 'total matching pairs: {0}'.format(pairs_counter)
        print 'ignored bad discriminators: {0}'.format(self._count_ignore)
        matchmaker.save()

    def _get_blocks(self, blocks, release_id, article_id):
        blocklist = []
        required_length_check = False
        for b in blocks:
            i = b[0]
            j = b[1]
            k = b[2]
            rel_match = self._tokens.get_stripped_release_token_block(
                release_id, j, j + k)
            art_match = self._tokens.get_stripped_article_token_block(
                article_id, i, i + k)
            rel_temp = ' '.join(rel_match)
            art_temp = ' '.join(art_match)
            if rel_temp.lower() != art_temp.lower():
                print rel_temp.lower()
                print art_temp.lower()
                raise Exception("blocks don't match")
            #check against bad discriminators BEFORE updating required_length_check
            if rel_temp.lower() in self._ignoreblocks:
                self._count_ignore += 1
                continue
            #check for min_length BEFORE updating required_length_check
            if len(rel_match) < self._min_length:
                continue
            if len(rel_match) >= self._required_length:
                required_length_check = True
            blocklist.append(b)
        if len(blocklist) == 0:
            return []
        if not required_length_check:
            return []
        #sort by length, descending
        blocklist = sorted(blocklist, key=itemgetter(2), reverse=True)
        return blocklist
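#Self-contained sketch (made-up sample sentences) of the difflib behaviour
#MatchFinder relies on: get_matching_blocks() yields (i, j, k) triples with
#seq1[i:i+k] == seq2[j:j+k], in the same (article, release) orientation as
#find_matches above.
def _demo_matching_blocks():
    release = 'shares rose after the company reported record earnings today'.split()
    article = 'the company reported record earnings according to analysts'.split()
    matcher = SequenceMatcher(autojunk=False)
    matcher.set_seq2(release)  #seq2 = release, as in find_matches
    matcher.set_seq1(article)  #seq1 = article
    for i, j, k in matcher.get_matching_blocks():
        if k > 0:  #the final block is always a zero-length sentinel
            print ' '.join(article[i:i + k])  #equals ' '.join(release[j:j + k])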
class MatchWriter(object):
    def __init__(self, company_id, matches_name):
        self._company_id = company_id
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()
        self._br = ConfigReader().get('MARKER_BR')

    def write_matches(self, output_path):
        html = self._build_html()
        filename = '{0}.html'.format(self._company_id)
        filepath = os.path.join(output_path, filename)
        self._write_html_to_file(filepath, html)

    def _build_html(self):
        sb = []
        counter = 0
        releases = self._get_sorted_releases()
        for release in releases:
            self._write_release_header(sb, release)
            articles = self._get_sorted_articles(release.id())
            for article in articles:
                #special-case date filtering for companies 35 and 32 only;
                #str() guards against the id being passed as an int
                if str(self._company_id) == '35':
                    delta = article.date() - release.date()
                    if delta.days >= TIME_DELTA and \
                            not (release.id() == 246 and article.id() == 944) and \
                            not (release.id() == 189 and article.id() == 1213) and \
                            not (release.id() == 71 and article.id() == 2557):
                        continue
                if str(self._company_id) == '32':
                    delta = article.date() - release.date()
                    if delta.days >= TIME_DELTA:
                        continue
                blocks = self._matchloader.get_matches(release.id(), article.id())
                self._write_article_summary(sb, blocks, release, article)
                self._write_texts(sb, blocks, release.id(), article.id())
                counter += 1
                print '{0}'.format(counter)
        return ''.join(sb)

    def _get_sorted_releases(self):
        ids = self._matchloader.get_release_ids()
        rels = [self._releases[id] for id in ids]
        rels.sort(key=lambda x: x.date())
        return rels

    def _get_sorted_articles(self, release_id):
        ids = self._matchloader.get_article_ids(release_id)
        arts = [self._articles[id] for id in ids]
        arts.sort(key=lambda x: x.date())
        return arts

    def _write_release_header(self, sb, release):
        sb.append('\n\t<tr>\n\t\t<td colspan="2" class="release-title">')
        sb.append('{0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))

    def _write_article_summary(self, sb, blocks, release, article):
        sb.append('\n\t<tr><td colspan=2>')
        sb.append('\n\t\t<table class="tbl-inner1" cellpadding="5" border="1">')
        sb.append('\n\t\t\t<tr class="tbl-inner1-title"><td colspan="3" class="article-title">')
        sb.append('R: {0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))
        sb.append('\n\t\t\t<tr class="tbl-inner1-title"><td colspan="3" class="article-title">')
        sb.append('A: {0} --- {1} --- {2} --- {3}\n\t\t</td>\n\t</tr>'.format(
            article.id(), article.date().strftime('%B %d'), article.headline(), article.pub()))
        sb.append('\n\t\t\t<tr class="tbl-inner1-title"><td>#</td><td>length</td><td>match</td></tr>')
        for count, block in enumerate(blocks):
            i = block[0]  #start in article
            j = block[1]  #start in release
            k = block[2]  #length
            rel_match = self._tokens.get_stripped_release_token_block(
                release.id(), j, j + k)
            art_match = self._tokens.get_stripped_article_token_block(
                article.id(), i, i + k)
            rel_temp = ' '.join(rel_match)
            art_temp = ' '.join(art_match)
            rel_temp = rel_temp.replace(self._br, ' ')
            art_temp = art_temp.replace(self._br, ' ')
            if rel_temp.lower() != art_temp.lower():
                print rel_temp.lower()
                print art_temp.lower()
                raise Exception("blocks don't match")
            sb.append('\n\t\t\t<tr valign="top">')
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(count + 1))
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(k))
            sb.append('\n\t\t\t\t<td><span class="match match{0}">{1}</span>\n\t\t</td>'
                      .format(count, rel_temp))
            sb.append('\n\t\t\t</tr>')
        sb.append('\n\t\t</table>')
        sb.append('\n\t</td></tr>')

    def _write_texts(self, sb, blocks, release_id, article_id):
        rel_tokens = self._tokens.get_release_tokens(release_id, False)
        art_tokens = self._tokens.get_article_tokens(article_id, False)
        rel_html = self._get_text(blocks, rel_tokens, POS_IN_BLOCK_REL)
        art_html = self._get_text(blocks, art_tokens, POS_IN_BLOCK_ART)
        sb.append('\n\t<tr valign="top">')
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(rel_html))
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(art_html))
        sb.append('\n\t</tr>')

    def _get_text(self, blocks, orig_tokens, pos_in_block):
        span_start = '<span class="match match{0}">'
        span_end = '</span>'
        #clone the token list so the original is untouched
        tokens = orig_tokens[:]
        #sort by position in the text
        blocks = sorted(blocks, key=itemgetter(pos_in_block))
        a = 0  #offset: each processed block adds two tags to the list
        for count, block in enumerate(blocks):
            pos = block[pos_in_block]  #position in text
            k = block[2]  #length
            tokens.insert(pos + a, span_start.format(count))
            tokens.insert(pos + k + a + 1, span_end)
            a += 2
        html = ' '.join(tokens)
        html = html.replace(self._br, '<br/>')
        return html

    def _write_html_to_file(self, output_path, html):
        with open(output_path, 'w') as f:
            f.write('<html>\n<head>')
            f.write('\n\t<link rel="stylesheet" type="text/css" href="styles.css">')
            f.write('\n</head>\n<body>\n')
            f.write('\n<table class="tbl-main" cellpadding="5" border="1">')
            f.write(html)
            f.write('\n</table>')
            f.write('\n\n</body>\n</html>')
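#Hypothetical usage sketch for MatchWriter: the match-set name and output
#directory are placeholders; styles.css is expected alongside the output file.
def _demo_matchwriter():
    writer = MatchWriter(1, 'matches-min5')
    writer.write_matches('/tmp/match-html')  #writes /tmp/match-html/1.html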