Exemple #1
0
    def test_create_article(self):
        """Check collocations around the life cycle of one article.

        Attaches the ``Hofmann-SIGIR99.pdf`` fixture to a new Article and
        saves it, then expects collocations to exist with a known first
        ngram. After the article is deleted, no collocation with a positive
        count may remain.
        """
        article = Article(venue_id=3, year=1999)
        fixture_path = os.path.join(
            settings.ROOT_DIR, 'articles', 'fixtures', 'Hofmann-SIGIR99.pdf')
        fixture_name = os.path.basename(fixture_path)
        with open(fixture_path, 'rb') as fixture:
            article.pdf.save(fixture_name, File(fixture), save=True)
        article.save()

        extracted = Collocations.objects.all()
        self.assertTrue(extracted)
        self.assertEqual(extracted[0].ngram, 'probabilistic latent semantic indexing')
        article.delete()

        # Deleting the article must leave no collocation with a positive count
        has_positive_counts = Collocations.objects.filter(count__gt=0).exists()
        self.assertFalse(has_positive_counts)
Exemple #2
0
 def generate_temp_article(text):
     """Create and store a temporary article built from raw ``text``.

     The article is attached to the venue with acronym ``SIGIR``, cluster
     ``CS_COLLOCS`` and year 2013, and carries the stemmed text plus a
     JSON-serialised ngram index. Every ``TestCollocations`` row found for
     the article is copied into ``article.CollocationModel`` (with
     ``total_count=0`` and empty ``extra_fields``) and the
     ``TestCollocations`` rows are then deleted.

     Returns the saved article.
     """
     # TODO: make this Article class method
     import json
     from axel.articles.models import Article, Venue, TestCollocations

     sigir = Venue.objects.get(acronym='SIGIR')
     stemmed = nlp.Stemmer.stem_wordnet(text)
     serialized_index = json.dumps(nlp.build_ngram_index(stemmed))
     article = Article(
         text=text,
         cluster_id='CS_COLLOCS',
         venue=sigir,
         year=2013,
         stemmed_text=stemmed,
         index=serialized_index,
     )
     # TODO: extract title and abstract
     # NOTE(review): save_base(raw=True) is used instead of save() --
     # presumably to bypass the regular save pipeline; confirm.
     article.save_base(raw=True)
     article._create_collocations(True)

     # Copy the test collocations over to the article's own collocation model
     staged = TestCollocations.objects.filter(article=article)
     for staged_colloc in staged:
         article.CollocationModel(
             ngram=staged_colloc.ngram,
             count=staged_colloc.count,
             article=article,
             total_count=0,
             extra_fields={},
         ).save()
     staged.delete()
     return article
Exemple #3
0
    def handle(self, *args, **options):
        """Import every PDF under a directory as an Article, then build collocations.

        Reads the ``dir``, ``venue``, ``year`` and ``cluster`` options. The
        venue is looked up by acronym. Each file ending in ``.pdf`` found
        while walking ``dir`` is stored on a new Article for that venue, year
        and cluster. Afterwards collocations are created for the cluster and
        every dashed ngram whose space-separated form also exists is printed.

        Raises:
            CommandError: if any of the four options is missing or empty, or
                if ``year`` cannot be converted to an integer.
        """
        pdf_dir = options['dir']
        venue = options['venue']
        cluster = options['cluster']
        if not pdf_dir:
            raise CommandError("need to specify directory")
        if not venue:
            raise CommandError("need to specify venue")
        # Convert the year as part of validation: a missing value (None or '')
        # becomes 0 and is reported as missing below instead of blowing up
        # with a TypeError in int() before any check has run.
        try:
            year = int(options['year'] or 0)
        except (TypeError, ValueError):
            raise CommandError("year must be an integer")
        if not year:
            raise CommandError("need to specify year")
        if not cluster:
            raise CommandError("need to specify cluster")

        venue = Venue.objects.get(acronym=venue)

        # Traverse and import PDFs
        # NOTE(review): article_ids is not read anywhere in this method as
        # shown -- confirm it is unused before removing it.
        article_ids = []
        for root, dirs, files in os.walk(pdf_dir):
            for name in files:
                if name.endswith('.pdf'):
                    full_path = os.path.join(root, name)
                    article = Article(venue=venue, year=year, cluster_id=cluster)
                    with open(full_path, 'rb') as pdf:
                        article.pdf.save(name, File(pdf), save=True)
                    article.save()
                    article_ids.append(article.id)

        print('Starting collocation population...')
        Article.create_collocations(cluster)

        print('Starting merging... (dashed ngrams)')
        all_ngrams = set(ArticleCollocation.objects.values_list('ngram', flat=True).distinct())
        dashed_ngrams = [ngram for ngram in all_ngrams if '-' in ngram]
        for d_ngram in dashed_ngrams:
            # Report dashed ngrams that also occur with spaces instead of dashes
            if d_ngram.replace('-', ' ') in all_ngrams:
                print(d_ngram)