def test_create_article(self):
    """Check that collocations appear on article creation and vanish on deletion.

    Attaches the ``Hofmann-SIGIR99.pdf`` fixture to a new article, saves it,
    and expects the first stored collocation to be
    'probabilistic latent semantic indexing'. After the article is deleted,
    no collocation with a positive count may remain.
    """
    fixture_path = os.path.join(settings.ROOT_DIR, 'articles', 'fixtures',
                                'Hofmann-SIGIR99.pdf')
    pdf_name = os.path.basename(fixture_path)

    new_article = Article(venue_id=3, year=1999)
    with open(fixture_path, 'rb') as pdf_handle:
        new_article.pdf.save(pdf_name, File(pdf_handle), save=True)
    new_article.save()

    stored = Collocations.objects.all()
    self.assertTrue(stored)
    self.assertEqual(stored[0].ngram,
                     'probabilistic latent semantic indexing')

    new_article.delete()

    # Check it's empty now
    has_positive_counts = Collocations.objects.filter(count__gt=0).exists()
    self.assertFalse(has_positive_counts)
def generate_temp_article(text):
    """Build and store a temporary SIGIR article from raw ``text``.

    The text is stemmed and an n-gram index is built from the stemmed form;
    both are stored on a new ``Article`` in the 'CS_COLLOCS' cluster for the
    year 2013. The article is written with ``save_base(raw=True)``, its
    collocations are created, and every ``TestCollocations`` row for the
    article is copied into ``article.CollocationModel`` before the
    ``TestCollocations`` rows are deleted.

    :param text: raw article text
    :returns: the saved ``Article`` instance
    """
    # TODO: make this Article class method
    # Imported inside the function, as in the original (presumably to avoid
    # a circular import with the models module -- confirm).
    from axel.articles.models import Article, Venue, TestCollocations
    import json

    sigir = Venue.objects.get(acronym='SIGIR')
    stemmed = nlp.Stemmer.stem_wordnet(text)
    ngram_index = nlp.build_ngram_index(stemmed)
    serialized_index = json.dumps(ngram_index)

    # TODO: extract title and abstract
    article = Article(
        text=text,
        cluster_id='CS_COLLOCS',
        venue=sigir,
        year=2013,
        stemmed_text=stemmed,
        index=serialized_index,
    )
    article.save_base(raw=True)
    article._create_collocations(True)

    # Copy the temporary rows into the article's collocation model.
    for temp_row in TestCollocations.objects.filter(article=article):
        copied = article.CollocationModel(
            ngram=temp_row.ngram,
            count=temp_row.count,
            article=article,
            total_count=0,
            extra_fields={},
        )
        copied.save()

    TestCollocations.objects.filter(article=article).delete()
    return article
def handle(self, *args, **options):
    """Import every PDF under a directory as an Article, then build collocations.

    Reads the ``dir``, ``venue``, ``year`` and ``cluster`` options. Each file
    whose name ends with '.pdf' found while walking ``dir`` is attached to a
    new ``Article`` for the given venue, year and cluster. Afterwards
    collocations are populated for the cluster, and every dashed n-gram whose
    space-separated variant also exists is printed.

    :raises CommandError: if any of the four options is missing or empty,
        if ``year`` is not an integer (or is zero), or if no venue with the
        given acronym exists.
    """
    # Local names avoid shadowing the ``dir`` builtin.
    pdf_dir = options['dir']
    venue_acronym = options['venue']
    raw_year = options['year']
    cluster = options['cluster']

    if not pdf_dir:
        raise CommandError("need to specify directory")
    if not venue_acronym:
        raise CommandError("need to specify venue")
    # Validate before converting: int(None) raises TypeError, which used to
    # pre-empt this user-facing error when the option was omitted.
    if not raw_year:
        raise CommandError("need to specify year")
    try:
        year = int(raw_year)
    except (TypeError, ValueError):
        raise CommandError("year must be an integer, got %r" % (raw_year,))
    if not year:
        # Keeps the original rejection of a zero year.
        raise CommandError("need to specify year")
    if not cluster:
        raise CommandError("need to specify cluster")

    try:
        venue = Venue.objects.get(acronym=venue_acronym)
    except Venue.DoesNotExist:
        raise CommandError("venue %r does not exist" % (venue_acronym,))

    # Traverse and import PDFs
    for root, _dirs, files in os.walk(pdf_dir):
        for name in files:
            if not name.endswith('.pdf'):
                continue
            full_path = os.path.join(root, name)
            article = Article(venue=venue, year=year, cluster_id=cluster)
            with open(full_path, 'rb') as pdf:
                article.pdf.save(name, File(pdf), save=True)
            article.save()

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Starting collocation population...')
    Article.create_collocations(cluster)

    print('Starting merging... (dashed ngrams)')
    all_ngrams = set(
        ArticleCollocation.objects.values_list('ngram', flat=True).distinct())
    dashed_ngrams = [ngram for ngram in all_ngrams if '-' in ngram]
    for d_ngram in dashed_ngrams:
        # Only report dashed forms that also occur written with spaces.
        if d_ngram.replace('-', ' ') in all_ngrams:
            print(d_ngram)