def __init__(self, company_id, matches_name):
    """Create the loaders and cached data used for one company.

    Args:
        company_id: identifier handed to every loader built here.
        matches_name: name passed to MatchLoader along with company_id.
    """
    self._company_id = company_id
    self._matchloader = MatchLoader(company_id, matches_name)
    self._tokens = TokenLoader(company_id)

    # Releases and articles are fetched once and kept on the instance.
    release_loader = ReleaseLoader(company_id)
    self._releases = release_loader.get_releases()
    article_loader = ArticleLoader(company_id)
    self._articles = article_loader.get_articles()

    # Value stored under the 'MARKER_BR' key of the configuration.
    config = ConfigReader()
    self._br = config.get('MARKER_BR')
def _process(self, company_id):
    """Visit every release id reported by the match loader for a company.

    For each release id the release object is looked up and its stripped
    token block is requested from the token loader.

    Args:
        company_id: identifier handed to every loader built here.
    """
    matchloader = MatchLoader(company_id, MATCHES_NAME)
    tokens = TokenLoader(company_id)
    releases = ReleaseLoader(company_id).get_releases()
    # NOTE(review): 'articles' and 'scores' are not used in the visible part
    # of this method; they are kept because constructing the loaders may
    # matter to code that is not shown here.
    articles = ArticleLoader(company_id).get_articles()
    scores = ScoreLoader(company_id)

    # Fixed: the loop used to reference the undefined name 'matcheloader',
    # which raised NameError as soon as the method was called.
    for release_id in matchloader.get_release_ids():
        release = releases[release_id]
        # NOTE(review): 0 and sys.maxint look like start/end bounds that
        # cover the whole release -- confirm against TokenLoader.
        rel_tokens = tokens.get_stripped_release_token_block(
            release_id, 0, sys.maxint)
def _load_art_hashes(self):
    """Return a dict mapping each article id to an MD5 hex digest.

    The digest covers the concatenation of the article's date() (converted
    with str()), pub(), headline() and body() values, in that order.
    """
    articles = ArticleLoader(self._company_id).get_articles()
    digests = {}
    for art_id in articles:
        art = articles[art_id]
        stamp = str(art.date())
        content = stamp + art.pub() + art.headline() + art.body()
        # Passing the data to the constructor is the same as md5() + update().
        digests[art_id] = hashlib.md5(content).hexdigest()
    return digests
def __init__(self, company_id, release_ids, article_ids, output_name):
    """Store the selection parameters and load the company's data.

    Args:
        company_id: identifier handed to the release and article loaders.
        release_ids: release ids kept on the instance as given.
        article_ids: article ids kept on the instance as given.
        output_name: output name kept on the instance as given.
    """
    # Plain parameters first.
    self._company_id = company_id
    self._release_ids = release_ids
    self._article_ids = article_ids
    self._output_name = output_name

    # Loaded data.
    release_loader = ReleaseLoader(company_id)
    self._releases = release_loader.get_releases()
    article_loader = ArticleLoader(company_id)
    self._articles = article_loader.get_articles()

    # Helper objects.
    self._tokenizer = Tokenizer()
    self._lexicon = SubjLexiconLoader()

    # Runs last, once every attribute above has been assigned.
    self._make_dirs()
def build_subset_all(self):
    """Add every release and article of companies 1 through 40 to one subset.

    The collected subset is saved under SUBSET_ALL.
    """
    maker = SubsetMaker()
    for cid in range(1, 41):
        print('Processing company {0}'.format(cid))
        # Releases are loaded and added before the articles are loaded.
        for rel_id in ReleaseLoader(cid).get_releases():
            maker.add_release(cid, rel_id)
        for art_id in ArticleLoader(cid).get_articles():
            maker.add_article(cid, art_id)
    maker.save(SUBSET_ALL)
def test_duplicateloader():
    """Print the duplicate releases and articles of one company.

    The company id is read from the first command-line argument.
    """
    company_id = int(sys.argv[1])
    dup_loader = DuplicateLoader(company_id)
    releases = ReleaseLoader(company_id).get_releases()
    articles = ArticleLoader(company_id).get_articles()
    row = '{0} : {1}'

    print('Testing DuplicateLoader')
    print('company-id: {0}'.format(company_id))

    release_dups = dup_loader.get_release_duplicates()
    print('RELEASE DUPLICATES: {0}'.format(len(release_dups)))
    for dup_id in release_dups:
        # Duplicate ids are converted to int before the lookup.
        rel = releases[int(dup_id)]
        print(row.format(rel.id(), rel.title()))

    article_dups = dup_loader.get_article_duplicates()
    print('ARTICLE DUPLICATES: {0}'.format(len(article_dups)))
    for dup_id in article_dups:
        art = articles[int(dup_id)]
        print(row.format(art.id(), art.headline()))
def __init__(self, company_id, release_ids, article_ids, required_length,
             min_length, blocks_name_toignore):
    """Store the parameters and load tokens, documents and duplicates.

    Args:
        company_id: identifier handed to every loader built here.
        release_ids: release ids kept on the instance as given.
        article_ids: article ids kept on the instance as given.
        required_length: stored as self._required_length.
        min_length: stored as self._min_length.
        blocks_name_toignore: name passed to BlockLoader; the blocks it
            returns are stored as self._ignoreblocks.
    """
    # Plain parameters.
    self._company_id = company_id
    self._release_ids = release_ids
    self._article_ids = article_ids
    self._required_length = required_length
    self._min_length = min_length

    # Loaded data.
    self._tokens = TokenLoader(company_id)
    release_loader = ReleaseLoader(company_id)
    self._releases = release_loader.get_releases()
    article_loader = ArticleLoader(company_id)
    self._articles = article_loader.get_articles()
    block_loader = BlockLoader(company_id, blocks_name_toignore)
    self._ignoreblocks = block_loader.get_blocks()
    self._count_ignore = 0

    # One DuplicateLoader supplies both duplicate collections.
    duplicate_loader = DuplicateLoader(company_id)
    self._rel_duplicates = duplicate_loader.get_release_duplicates()
    self._art_duplicates = duplicate_loader.get_article_duplicates()
def test_tokenloader():
    """Print the tokens of one article and one release of a company.

    The company id is read from the first command-line argument; the
    article and release are the first values yielded by itervalues().
    """
    company_id = int(sys.argv[1])
    token_loader = TokenLoader(company_id)
    release_loader = ReleaseLoader(company_id)
    article_loader = ArticleLoader(company_id)

    print('Testing TokenLoader')
    print('company-id: {0}'.format(company_id))

    articles = article_loader.get_articles()
    first_article = next(articles.itervalues())
    article_id = first_article.id()
    print('article-id: {0}'.format(article_id))
    print('TOKENS:')
    print(token_loader.get_article_tokens(article_id, False))

    releases = release_loader.get_releases()
    first_release = next(releases.itervalues())
    release_id = first_release.id()
    print('release-id: {0}'.format(release_id))
    print('TOKENS:')
    print(token_loader.get_release_tokens(release_id, False))
def test_articleloader():
    """Print how many articles ArticleLoader returns for one company.

    The company id is read from the first command-line argument.
    """
    company_id = int(sys.argv[1])
    loader = ArticleLoader(company_id)

    print('Testing ArticleLoader')
    print('company-id: {0}'.format(company_id))
    article_count = len(loader.get_articles())
    print('articles: {0}'.format(article_count))