예제 #1
0
파일: tests.py 프로젝트: XI-lab/axel
    def test_create_article(self):
        """Saving an article with a PDF yields collocations; deleting it clears them.

        Uses the ``Hofmann-SIGIR99.pdf`` fixture and expects the first
        collocation to be 'probabilistic latent semantic indexing'.
        """
        article = Article(venue_id=3, year=1999)
        fixture_path = os.path.join(
            settings.ROOT_DIR, 'articles', 'fixtures', 'Hofmann-SIGIR99.pdf')
        fixture_name = os.path.basename(fixture_path)
        with open(fixture_path, 'rb') as fixture:
            article.pdf.save(fixture_name, File(fixture), save=True)
        article.save()

        # Collocations must exist once the article has been saved
        extracted = Collocations.objects.all()
        self.assertTrue(extracted)
        self.assertEqual(extracted[0].ngram, 'probabilistic latent semantic indexing')

        article.delete()

        # After deletion no collocation may keep a positive count
        self.assertFalse(Collocations.objects.filter(count__gt=0).exists())
예제 #2
0
파일: import.py 프로젝트: XI-lab/axel
    def handle(self, *args, **options):
        """Import every PDF under a directory as articles, then build collocations.

        Reads these entries from ``options``:
          dir     -- directory that is walked recursively for ``*.pdf`` files
          venue   -- venue acronym, resolved through ``Venue.objects.get``
          year    -- publication year (converted with ``int``)
          cluster -- value stored as ``cluster_id`` on each article and passed
                     to ``Article.create_collocations``

        Raises ``CommandError`` when any of the four options is missing/empty.
        A non-numeric year still raises ``ValueError`` from ``int``.
        """
        dir = options['dir']
        venue = options['venue']
        # Fall back to 0 for a missing year so that the explicit check below
        # reports it as a CommandError instead of int(None) raising TypeError
        # before any validation has run.
        year = int(options['year'] or 0)
        cluster = options['cluster']
        if not dir:
            raise CommandError("need to specify directory")
        if not venue:
            raise CommandError("need to specify venue")
        if not year:
            raise CommandError("need to specify year")
        if not cluster:
            raise CommandError("need to specify cluster")

        # Replace the acronym with the matching Venue row; .get() is used, so
        # an unknown acronym propagates the ORM's lookup error.
        venue = Venue.objects.get(acronym=venue)

        # Traverse and import PDFs
        article_ids = []
        for root, dirs, files in os.walk(dir):
            for name in files:
                if name.endswith('.pdf'):
                    full_path = os.path.join(root, name)
                    article = Article(venue=venue, year=year, cluster_id=cluster)
                    with open(full_path, 'rb') as pdf:
                        article.pdf.save(name, File(pdf), save=True)
                    article.save()
                    article_ids.append(article.id)

        # Single-argument print(...) emits the same text under Python 2's
        # print statement and is also valid Python 3 syntax.
        print('Starting collocation population...')
        Article.create_collocations(cluster)

        print('Starting merging... (dashed ngrams)')
        all_ngrams = set(ArticleCollocation.objects.values_list('ngram', flat=True).distinct())
        dashed_ngrams = [ngram for ngram in all_ngrams if '-' in ngram]
        # Report each dashed ngram whose space-separated spelling also exists
        for d_ngram in dashed_ngrams:
            if d_ngram.replace('-', ' ') in all_ngrams:
                print(d_ngram)