Ejemplo n.º 1
0
    def write_tags(self, matches_name, company_id):
        """Process the tokens of all matched documents and pickle the results.

        The release ids and article ids of match set ``matches_name`` (for
        ``company_id``) are visited in turn; the tokens of each document are
        passed to ``self._process_tokens`` together with a dict that is shared
        by all documents of the same type. Both dicts are finally handed to
        ``self._pickle`` with a path of the form
        ``<postags path>/<matches_name>/<doctype>``.
        """
        loader = MatchLoader(company_id, matches_name)
        token_source = TokenLoader(company_id)

        # Releases first; progress is reported with a 1-based counter.
        release_tags = {}
        release_ids = loader.get_release_ids()
        for position, release_id in enumerate(release_ids, 1):
            print('processing release #{0} of {1}'.format(
                position, len(release_ids)))
            release_tokens = token_source.get_release_tokens(release_id, False)
            self._process_tokens(release_tokens, release_tags, release_id)

        # Then the articles, collected in their own dict.
        article_tags = {}
        article_ids = loader.get_article_ids()
        for position, article_id in enumerate(article_ids, 1):
            print('processing article #{0} of {1}'.format(
                position, len(article_ids)))
            article_tokens = token_source.get_article_tokens(article_id, False)
            self._process_tokens(article_tokens, article_tags, article_id)

        target_dir = os.path.join(common.get_postags_path(), matches_name)

        self._pickle(company_id, release_tags,
                     os.path.join(target_dir, common.DOCTYPE_PR))
        self._pickle(company_id, article_tags,
                     os.path.join(target_dir, common.DOCTYPE_NEWS))
Ejemplo n.º 2
0
 def __init__(self, company_id, matches_name):
     """Load the match set, tokens, releases and articles of one company.

     The value configured under ``MARKER_BR`` is kept in ``self._br``.
     """
     self._company_id = company_id
     self._matchloader = MatchLoader(company_id, matches_name)
     self._tokens = TokenLoader(company_id)

     release_loader = ReleaseLoader(company_id)
     self._releases = release_loader.get_releases()

     article_loader = ArticleLoader(company_id)
     self._articles = article_loader.get_articles()

     config = ConfigReader()
     self._br = config.get('MARKER_BR')
Ejemplo n.º 3
0
def test_matchloader_forall():
    """Print the matching-block count of companies 1 through 40.

    The name of the match set is taken from ``sys.argv[1]``; one count is
    printed per line, in ascending company-id order.
    """
    matches_name = sys.argv[1]

    for company_id in range(1, 41):
        loader = MatchLoader(company_id, matches_name)
        # Other figures the loader can report: len(get_release_ids()),
        # len(get_article_ids()) and count_rel_art_pairs().
        block_total = loader.count_matching_blocks()
        print('{0}'.format(block_total))
Ejemplo n.º 4
0
    def filter_exclude_pairs(self, pairs_name):
        """Copy the input match set, leaving out the pairs in ``pairs_name``.

        Every release/article pair of the input match set that the pair set
        ``pairs_name`` does not contain is added, with all of its blocks, to
        the output match set, which is saved at the end.
        """
        source = MatchLoader(self._company_id, self._match_name_in)
        target = MatchMaker(self._company_id, self._match_name_out)
        excluded = PairLoader(self._company_id, pairs_name)

        for release_id in source.get_release_ids():
            for article_id in source.get_article_ids(release_id):
                if excluded.has_pair(release_id, article_id):
                    continue
                pair_blocks = source.get_matches(release_id, article_id)
                target.add_blocks(release_id, article_id, pair_blocks)

        target.save()
Ejemplo n.º 5
0
    def print_matrix(self):
        """Print a comma-separated table with one row per release/article pair.

        Companies 1 through 40 are visited. For every matched pair the row
        holds: company id, release id, article id, stripped release length,
        stripped article length, the share of the release covered by matching
        blocks (``rel-used``), one minus the share of the article covered by
        matching blocks (``art-added``), and the subjectivity and sentiment
        scores of release and article.

        Subjectivity is subjective sentences / all sentences. Sentiment is
        (positive - negative) / subjective sentences, or 0.0 when the document
        has no subjective sentences.

        Raises:
            ZeroDivisionError: if a document has no sentences or no stripped
                tokens.
        """
        header = ('co-id, rel-id, art-id, rel-len, art-len, rel-used, '
                  'art-added, rel-subj-score, art-subj-score, '
                  'rel-sent-score, art-sent-score\n')
        row_format = ('{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, '
                      '{10}\n')
        sb = [header]

        for company_id in range(1, 41):
            matches = MatchLoader(company_id, self._match_name)
            tokens = TokenLoader(company_id)
            scores = ScoreLoader(company_id)

            for release_id in matches.get_release_ids():
                rel_tokens = tokens.get_stripped_release_token_block(
                    release_id, 0, sys.maxint)
                rel_len = len(rel_tokens)

                # float() forces true division: with the plain `/` operator
                # Python 2 truncates int / int, which collapsed every ratio
                # to a whole number.
                rel_subj_count = scores.count_subj_rel_sentences(release_id)
                rel_subj = (float(rel_subj_count) /
                            scores.count_all_rel_sentences(release_id))

                # Release sentiment score; always bound to `rel_sent`, the
                # name the row below reads.
                if rel_subj_count == 0:
                    rel_sent = 0.0
                else:
                    pos_minus_neg = (
                        scores.count_pos_rel_sentences(release_id) -
                        scores.count_neg_rel_sentences(release_id))
                    rel_sent = float(pos_minus_neg) / rel_subj_count

                for article_id in matches.get_article_ids(release_id):
                    art_tokens = tokens.get_stripped_article_token_block(
                        article_id, 0, sys.maxint)
                    art_len = len(art_tokens)

                    # Total stripped length of all matching blocks, measured
                    # on the release side (b[1] = start in the release,
                    # b[2] = block length).
                    blocklen = 0
                    for b in matches.get_matches(release_id, article_id):
                        start = b[1]
                        end = start + b[2]
                        block_tokens = \
                            tokens.get_stripped_release_token_block(
                                release_id, start, end)
                        blocklen += len(block_tokens)

                    rel_used = float(blocklen) / rel_len
                    art_added = 1 - float(blocklen) / art_len

                    art_subj_count = scores.count_subj_art_sentences(
                        article_id)
                    art_subj = (float(art_subj_count) /
                                scores.count_all_art_sentences(article_id))

                    # Article sentiment score; always bound to `art_sent`.
                    if art_subj_count == 0:
                        art_sent = 0.0
                    else:
                        pos_minus_neg = (
                            scores.count_pos_art_sentences(article_id) -
                            scores.count_neg_art_sentences(article_id))
                        art_sent = float(pos_minus_neg) / art_subj_count

                    sb.append(row_format.format(
                        company_id, release_id, article_id, rel_len, art_len,
                        rel_used, art_added, rel_subj, art_subj, rel_sent,
                        art_sent))

        text = ''.join(sb)
        print(text)
Ejemplo n.º 6
0
def main():
    """Write the texts of all releases and articles of one match set.

    Command line: ``<company-id> <input-name> <input-type> <output-name>``.
    ``'m'`` (the input name refers to a match set) is the only input type
    handled.

    Raises:
        ValueError: if the company id is not an integer or the input type is
            not ``'m'``.
    """
    company_id = int(sys.argv[1])
    input_name = sys.argv[2]
    input_type = sys.argv[3]
    output_name = sys.argv[4]

    # Reject unknown input types up front; without this check the id lists
    # below would be unbound and the TextWriter call would fail with an
    # UnboundLocalError.
    if input_type != 'm':
        raise ValueError(
            "unsupported input type {0!r}: expected 'm'".format(input_type))

    mloader = MatchLoader(company_id, input_name)
    release_ids = mloader.get_release_ids()
    article_ids = mloader.get_article_ids()

    tw = TextWriter(company_id, release_ids, article_ids, output_name)
    tw.write()
Ejemplo n.º 7
0
def main():
    """Run ``SentenceWriter.write_and_calculate`` for companies 21 to 40.

    Command line: ``<company-id> <input-name> <output-name>``. For each
    company the release and article ids of match set ``<input-name>`` are
    passed to a ``SentenceWriter`` together with ``<output-name>``.
    """
    # NOTE(review): the company id from the command line is parsed but the
    # loop below rebinds the name, so the argument has no effect on which
    # companies are processed - confirm whether that is intended.
    company_id = int(sys.argv[1])
    input_name, output_name = sys.argv[2], sys.argv[3]

    for company_id in range(21, 41):
        print('PROCESSING COMPANY {0}'.format(company_id))

        loader = MatchLoader(company_id, input_name)
        writer = SentenceWriter(
            company_id,
            loader.get_release_ids(),
            loader.get_article_ids(),
            output_name)
        writer.write_and_calculate()
Ejemplo n.º 8
0
    def _process(self, company_id):
        """Load the data of ``company_id`` and walk its matched releases.

        Matches (from ``MATCHES_NAME``), tokens, releases, articles and scores
        are loaded for the company; for every release id of the match set the
        release object and its stripped tokens are fetched.
        """
        matchloader = MatchLoader(company_id, MATCHES_NAME)
        tokens = TokenLoader(company_id)
        releases = ReleaseLoader(company_id).get_releases()
        articles = ArticleLoader(company_id).get_articles()
        scores = ScoreLoader(company_id)

        # Iterate the loader created above (the name was previously
        # misspelled here, which raised a NameError on the first call).
        for release_id in matchloader.get_release_ids():
            release = releases[release_id]
            rel_tokens = tokens.get_stripped_release_token_block(release_id, 0, sys.maxint)
Ejemplo n.º 9
0
    def filter_by_min_len(self, min_len):
        """Copy the input match set, keeping only sufficiently long blocks.

        A block is kept when its stripped release tokens number at least
        ``min_len``. Pairs for which no block survives are not added to the
        output match set. The output match set is saved at the end.
        """
        matches = MatchLoader(self._company_id, self._match_name_in)
        maker = MatchMaker(self._company_id, self._match_name_out)

        for release_id in matches.get_release_ids():
            for article_id in matches.get_article_ids(release_id):
                newblocks = []
                for b in matches.get_matches(release_id, article_id):
                    start = b[1]  # start of the block in the release
                    end = start + b[2]  # b[2] is the block length
                    tkns = self._tokens.get_stripped_release_token_block(
                        release_id, start, end)
                    if len(tkns) >= min_len:
                        newblocks.append(b)

                # Hand the surviving blocks over once per pair, after every
                # block has been checked; doing this inside the block loop
                # re-added the growing list after each block.
                if newblocks:
                    maker.add_blocks(release_id, article_id, newblocks)
        maker.save()
Ejemplo n.º 10
0
def test_matchloader():
    """Print summary figures of one match set.

    Command line: ``<company-id> <matches-name>``. The number of release ids,
    article ids, release/article pairs and matching blocks reported by the
    ``MatchLoader`` are printed, one labelled line each.
    """
    company_id = int(sys.argv[1])
    matches_name = sys.argv[2]
    loader = MatchLoader(company_id, matches_name)

    print('Testing MatchLoader')
    print('company-id: {0}'.format(company_id))

    release_total = len(loader.get_release_ids())
    print('releases: {0}'.format(release_total))

    article_total = len(loader.get_article_ids())
    print('articles: {0}'.format(article_total))

    pair_total = loader.count_rel_art_pairs()
    print('pairs: {0}'.format(pair_total))

    block_total = loader.count_matching_blocks()
    print('blocks: {0}'.format(block_total))
Ejemplo n.º 11
0
 def print_pairs(self):
     sb = []
     for company_id in range(1, 41): 
         matches = MatchLoader(company_id, self._match_name)
         for release_id in matches.get_release_ids():
Ejemplo n.º 12
0
 def __init__(self, company_id, matches_name):
     """Load the match set and tokens of one company.

     The value configured under ``MARKER_BR`` is kept in ``self._br``.
     """
     self._matchloader = MatchLoader(company_id, matches_name)
     self._tokens = TokenLoader(company_id)

     config = ConfigReader()
     self._br = config.get('MARKER_BR')
Ejemplo n.º 13
0
class BlockFinder(object):
    """Print matching text blocks found in one company's match set.

    Block texts are built from the stripped release tokens of a block, joined
    with single spaces, with every occurrence of the configured ``MARKER_BR``
    value replaced by a space.
    """

    def __init__(self, company_id, matches_name):
        """Load the match set and tokens of ``company_id``."""
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._br = ConfigReader().get('MARKER_BR')

    def _release_block_texts(self, release_id, min_len, max_len):
        """Yield the text of each block of ``release_id`` within the bounds.

        A block qualifies when its stripped release tokens number at least
        ``min_len`` and fewer than ``max_len``. Blocks are visited article by
        article, in the order the match loader reports them.
        """
        for article_id in self._matchloader.get_article_ids(release_id):
            pair_blocks = self._matchloader.get_matches(release_id, article_id)
            for block in pair_blocks:
                start = block[1]   # start of the block in the release
                length = block[2]  # block length

                words = self._tokens.get_stripped_release_token_block(
                    release_id, start, start + length)

                if min_len <= len(words) < max_len:
                    yield ' '.join(words).replace(self._br, ' ')

    def print_all_matching_blocks(self, min_len, max_len):
        """Print every matching block whose length is within the bounds."""
        for release_id in self._matchloader.get_release_ids():
            for text in self._release_block_texts(release_id, min_len,
                                                  max_len):
                print(text)

    def print_all_nondiscrim_release_blocks(self, min_len, max_len):
        """Print the blocks that occur in more than one release.

        Such blocks are bad discriminators between releases. Block texts are
        lower-cased and stripped before they are compared, and a text is
        counted at most once per release.
        """
        # Distinct normalised block texts of each release.
        texts_by_release = {}
        for release_id in self._matchloader.get_release_ids():
            release_texts = set()
            texts_by_release[release_id] = release_texts
            for text in self._release_block_texts(release_id, min_len,
                                                  max_len):
                release_texts.add(text.lower().strip())

        # Number of releases in which each text occurs.
        release_counts = {}
        for release_id in texts_by_release:
            for text in texts_by_release[release_id]:
                release_counts[text] = release_counts.get(text, 0) + 1

        # Report the texts shared by at least two releases.
        for text in release_counts:
            if release_counts[text] > 1:
                print(text)
Ejemplo n.º 14
0
class MatchWriter(object):
    """Render the matches of one company as an HTML report.

    The report is a table: a header row per release, and for each reported
    release/article pair a summary of the matching blocks followed by the two
    texts side by side with the matching blocks wrapped in ``<span>`` tags.
    """

    # Release/article id pairs of company '35' that are reported even when
    # the article lies TIME_DELTA or more days after the release.
    _EXEMPT_PAIRS_35 = ((246, 944), (189, 1213), (71, 2557))

    def __init__(self, company_id, matches_name):
        """Load the match set, tokens, releases and articles of a company."""
        self._company_id = company_id
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()
        self._br = ConfigReader().get('MARKER_BR')

    def write_matches(self, output_path):
        """Write the report to ``<output_path>/<company_id>.html``."""
        html = self._build_html()
        filename = '{0}.html'.format(self._company_id)
        filepath = os.path.join(output_path, filename)
        self._write_html_to_file(filepath, html)

    def _build_html(self):
        """Return the table rows of the report as one string.

        Releases and their articles are visited in date order. The number of
        reported pairs is printed before returning.
        """
        sb = []
        counter = 0

        for release in self._get_sorted_releases():
            self._write_release_header(sb, release)

            for article in self._get_sorted_articles(release.id()):
                if self._is_filtered_out(release, article):
                    continue

                blocks = self._matchloader.get_matches(release.id(),
                                                       article.id())

                self._write_article_summary(sb, blocks, release, article)
                self._write_texts(sb, blocks, release.id(), article.id())
                counter += 1

        print('{0}'.format(counter))
        return ''.join(sb)

    def _is_filtered_out(self, release, article):
        """Return True if the pair is dropped by a company-specific rule.

        Only companies '35' and '32' are filtered: a pair is dropped when the
        article is dated TIME_DELTA or more days after the release, except
        for the exempt pairs of company '35'.
        """
        # NOTE(review): the company id is compared against strings, so these
        # rules never apply when the writer was constructed with an int id -
        # confirm what callers pass.
        if self._company_id not in ('35', '32'):
            return False

        delta = article.date() - release.date()
        if delta.days < TIME_DELTA:
            return False

        if self._company_id == '32':
            return True
        return (release.id(), article.id()) not in self._EXEMPT_PAIRS_35

    def _get_sorted_releases(self):
        """Return the matched releases ordered by date."""
        release_ids = self._matchloader.get_release_ids()
        return sorted((self._releases[rid] for rid in release_ids),
                      key=lambda rel: rel.date())

    def _get_sorted_articles(self, release_id):
        """Return the articles matched to ``release_id`` ordered by date."""
        article_ids = self._matchloader.get_article_ids(release_id)
        return sorted((self._articles[aid] for aid in article_ids),
                      key=lambda art: art.date())

    def _write_release_header(self, sb, release):
        """Append the header row of ``release`` to ``sb``."""
        sb.append('\n\t<tr>\n\t\t<td colspan="2" class="release-title">')
        sb.append('{0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))

    def _write_article_summary(self, sb, blocks, release, article):
        """Append the summary table of one release/article pair to ``sb``.

        The table names the release and the article and lists every matching
        block with its 1-based number, its length and its text.

        Raises:
            Exception: if the release text and the article text of a block
                differ (compared case-insensitively).
        """
        title_row = ('\n\t\t\t<tr class="tbl-inner1-title">'
                     '<td colspan="3" class="article-title">')

        sb.append('\n\t<tr><td colspan=2>')
        # The opening tag used to end in a stray 'i' (border="1"i>), which
        # made the markup malformed.
        sb.append(
            '\n\t\t<table class="tbl-inner1" cellpadding="5" border="1">')

        sb.append(title_row)
        sb.append('R: {0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))

        sb.append(title_row)
        sb.append('A: {0} --- {1} --- {2} --- {3}\n\t\t</td>\n\t</tr>'.format(
            article.id(), article.date().strftime('%B %d'),
            article.headline(), article.pub()))

        sb.append(
            '\n\t\t\t<tr class="tbl-inner1-title"><td>#</td><td>length</td><td>match</td></tr>'
        )

        for count, block in enumerate(blocks):
            i = block[0]  # start in article
            j = block[1]  # start in release
            k = block[2]  # length

            rel_match = self._tokens.get_stripped_release_token_block(
                release.id(), j, j + k)
            art_match = self._tokens.get_stripped_article_token_block(
                article.id(), i, i + k)

            rel_temp = ' '.join(rel_match).replace(self._br, ' ')
            art_temp = ' '.join(art_match).replace(self._br, ' ')

            # Sanity check: both sides of a block must carry the same text.
            if rel_temp.lower() != art_temp.lower():
                print(rel_temp.lower())
                print(art_temp.lower())
                raise Exception("blocks don't match")

            sb.append('\n\t\t\t<tr valign="top">')
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(count + 1))
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(k))
            sb.append(
                '\n\t\t\t\t<td><span class="match match{0}">{1}</span>\n\t\t</td>'
                .format(count, rel_temp))
            sb.append('\n\t\t\t</tr>')

        sb.append('\n\t\t</table>')
        sb.append('\n\t</td></tr>')

    def _write_texts(self, sb, blocks, release_id, article_id):
        """Append the row holding the marked-up release and article texts."""
        rel_tokens = self._tokens.get_release_tokens(release_id, False)
        art_tokens = self._tokens.get_article_tokens(article_id, False)
        rel_html = self._get_text(blocks, rel_tokens, POS_IN_BLOCK_REL)
        art_html = self._get_text(blocks, art_tokens, POS_IN_BLOCK_ART)

        sb.append('\n\t<tr valign="top">')
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(rel_html))
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(art_html))
        sb.append('\n\t</tr>')

    def _get_text(self, blocks, orig_tokens, pos_in_block):
        """Return the tokens as HTML with every block wrapped in a span.

        Args:
            blocks: matching blocks; ``block[pos_in_block]`` is the start of
                the block in ``orig_tokens`` and ``block[2]`` its length.
            orig_tokens: token list of the text; it is not modified.
            pos_in_block: index of the start position inside a block.

        Returns:
            The tokens joined by spaces, with span tags inserted around each
            block and the ``MARKER_BR`` value replaced by ``<br/>``. Spans
            are numbered in order of their position in the text.
        """
        span_start = '<span class="match match{0}">'
        span_end = '</span>'
        # Work on a copy so the caller's list stays untouched.
        tokens = orig_tokens[:]

        # Insert front to back; `shift` counts the tags already inserted,
        # which move every later position to the right.
        shift = 0
        ordered = sorted(blocks, key=itemgetter(pos_in_block))
        for count, block in enumerate(ordered):
            pos = block[pos_in_block]  # position in text
            k = block[2]  # length

            tokens.insert(pos + shift, span_start.format(count))
            # +1 accounts for the opening tag inserted just above.
            tokens.insert(pos + k + shift + 1, span_end)
            shift += 2

        html = ' '.join(tokens)
        return html.replace(self._br, '<br/>')

    def _write_html_to_file(self, output_path, html):
        """Write ``html`` to ``output_path`` wrapped in a full HTML page."""
        with open(output_path, 'w') as f:
            f.write('<html>\n<head>')
            f.write(
                '\n\t<link rel="stylesheet" type="text/css" href="styles.css">'
            )
            f.write('\n</head>\n<body>\n')
            f.write('\n<table class="tbl-main" cellpadding="5" border="1">')
            f.write(html)
            f.write('\n</table>')
            f.write('\n\n</body>\n</html>')