def test_build(self):
     """
     `build` should populate the collection with one graph per distinct
     'date' value in the corpus index.
     """
     corpus = read(datapath)
     G = GraphCollection()
     G.build(corpus, coauthors)
     self.assertEqual(len(G), len(corpus.indices['date']))
 def setUp(self):
     """Load the WoS fixture and define a label with its expected variants."""
     self.corpus = wos.read(datapath)
     self.label = 'ALBERTINDF'
     self.members = [u'ALBERTINDF',
                     u'ALBERTINID',
                     u'ALBERTINID F',
                     u'ALBERTINIDAVID',
                     u'ALBERTINIDAVID F',
                     u'ALBERTINIDF']
 def test_init_build(self):
     """
     Passing a corpus and a graph method to the constructor should build
     `graphs` immediately — one per distinct 'date' value.
     """
     c = read(datapath, index_fields=['ayjid', 'date'])
     collection = GraphCollection(c, coauthors)
     self.assertEqual(len(collection), len(c.indices['date']))
 def test_build_streaming(self):
     """
     `build` should also work on a streaming corpus; the resulting
     collection should hold one graph per distinct 'date' value.
     """
     corpus = read(datapath, streaming=True)
     G = GraphCollection()
     G.build(corpus, coauthors, slice_kwargs={'feature_name': 'authors'})
     self.assertEqual(len(G), len(corpus.indices['date']))
    def test_global_closeness_node(self):
        """Requesting a single node should yield a positive float score."""
        graph = coauthors(read(datapath))
        score = global_closeness_centrality(graph, node=('SEDA', 'B C'))

        self.assertIsInstance(score, float)
        self.assertGreater(score, 0)
    def setUp(self):
        """Build a Corpus from the WoS fixture, slice by year, and index abstracts."""
        papers = wos.read('{0}/wos.txt'.format(datapath))
        corpus = Corpus(papers, index_by='ayjid')
        corpus.slice('date', method='time_period', window_size=1)
        corpus.abstract_to_features()
        self.corpus = corpus
    def setUp(self):
        """
        Create the two test tables, dropping and re-creating them if a
        previous run left them behind, then wire up a SQLPapers store and
        parse the WoS fixture.
        """
        self.conn = psycopg2.connect(dbargs)
        self.cur = self.conn.cursor()

        try:
            self.cur.execute(PAPER_TABLE.format('tethne_test'))
            self.conn.commit()
            self.cur.execute(PAPER_TABLE.format('tethne_test_citations'))
            self.conn.commit()
        except psycopg2.Error:
            # CREATE failed (most likely the tables already exist).
            # NOTE(review): the original committed here; on an aborted
            # transaction the server treats COMMIT as a rollback.
            self.conn.commit()
            for table in ('tethne_test', 'tethne_test_citations'):
                try:
                    self.cur.execute('DROP TABLE {0};'.format(table))
                except psycopg2.Error:
                    pass    # Table was not there; nothing to drop.
            self.conn.commit()

            self.cur.execute(PAPER_TABLE.format('tethne_test'))
            self.cur.execute(PAPER_TABLE.format('tethne_test_citations'))

            self.conn.commit()
        self.cur.close()
        self.sqlpapers = SQLPapers(self.conn, dbparams, table='tethne_test')

        self.papers = wos.read(datapath + '/wos.txt')
 def test_iter(self):
     """Iterating a SQL-backed store should visit every record as a Paper."""
     spapers = wos.read(datapath + '/wos.txt', papers=self.sqlpapers)
     count = 0
     for paper in spapers:
         self.assertIsInstance(paper, Paper)
         count += 1
     # Iteration should cover the entire collection.
     self.assertEqual(count, len(spapers))
Esempio n. 9
0
    def test_upload(self):
        """Uploading a small WoS corpus should issue the expected POST calls."""
        from tethne.readers import wos
        corpus = wos.read('tests/data/wos.txt')

        client, get, post = _new_mocked_client()
        client.upload(corpus, 'a test', 'WOS', 1000, skip_duplicates=False)
        # Authorization, corpus creation, and then six models.
        self.assertEqual(post.call_count, 8)
    def setUp(self):
        """
        Index abstracts, derive a transformed featureset 'xf', and fit a
        20-topic LDA model on it.
        """
        corpus = read(datapath, index_by='wosid')
        corpus.index_feature('abstract', tokenize)

        # PEP 8: use a named function rather than assigning a lambda.
        # Triples each raw count; the other transform arguments are unused.
        def xf(f, c, C, DC):
            return c * 3

        corpus.features['xf'] = corpus.features['abstract'].transform(xf)
        self.model = LDAModel(corpus, featureset_name='xf')
        self.model.fit(Z=20, max_iter=500)
Esempio n. 11
0
    def handle(self, *args, **options):
        """Stream the WoS file at ``path`` into the database in batches."""
        path = options.get('path')[0]
        label = options.get('label')[0]
        batch_size = options.get('batch_size')[0]

        corpus = wos.read(path, streaming=True)
        CorpusHandler(corpus, label, batch_size).run()
    def test_global_closeness(self):
        """Every node in the graph should receive a closeness score."""
        graph = coauthors(read(datapath))
        scores = global_closeness_centrality(graph)

        self.assertIsInstance(scores, dict)
        for node in graph.nodes():
            self.assertIn(node, scores)
Esempio n. 13
0
    def setUp(self):
        """Prepare temp output paths and an indexed corpus for the writer."""
        self.temp = tempfile.mkdtemp()
        self.target = os.path.join(self.temp, 'test')
        self.corpus = read(datapath, index_by='wosid',
                           index_features=['citations', 'authors'])
        # Output files the writer under test is expected to produce.
        self.dpath = self.target + '_docs.txt'
        self.mpath = self.target + '_meta.csv'
 def test_init_build_streaming(self):
     """
     The constructor should also build `graphs` for a streaming corpus.
     """
     c = read(datapath, streaming=True)
     collection = GraphCollection(c, coauthors,
                                  slice_kwargs={'feature_name': 'authors'})
     self.assertEqual(len(collection), len(c.indices['date']))
    def testBasic(self):
        """
        `to_sif` should run without error; the current fixture yields an
        empty output file.
        """
        self.corpus = read(datapath)
        self.graph = coauthors(self.corpus)

        # mkstemp returns an open OS-level descriptor; close it so the
        # handle is not leaked (to_sif writes to the path itself).
        fd, self.temp = tempfile.mkstemp(suffix='.toSif')
        os.close(fd)

        to_sif(self.graph, self.temp)

        self.assertEqual(os.path.getsize(self.temp), 0)
    def test_read_wos(self):
        """
        When passed as the ``papers`` kwarg to the WoS reader, the
        :class:`SQLPapers` store should be used as the container for the
        parsed :class:`.Paper` instances.
        """

        spapers = wos.read(datapath + '/wos.txt', papers=self.sqlpapers)
        self.assertIsInstance(spapers, SQLPapers)
        self.assertEqual(len(spapers), 10)
        self.assertIsInstance(spapers[0], Paper)
Esempio n. 17
0
    def test_upload_alt(self):
        """
        Regression: a large Corpus containing special characters should
        upload cleanly.
        """
        from tethne.readers import wos
        corpus = wos.read('tests/data/1-500.txt')

        client, get, post = _new_mocked_client()
        client.upload(corpus, 'a test', 'WOS', 5000, skip_duplicates=False)
        # Authorization, corpus creation, and then six models.
        self.assertEqual(post.call_count, 38)
Esempio n. 18
0
    def test_upload_keyerror(self):
        """
        Regression: a large Corpus that once triggered a KeyError during
        upload should now upload cleanly.
        """
        from tethne.readers import wos
        corpus = wos.read('tests/data/21001-21500.txt')

        client, get, post = _new_mocked_client()
        client.upload(corpus, 'a test', 'WOS', 5000, skip_duplicates=False)
        # Authorization, corpus creation, and then six models.
        self.assertEqual(post.call_count, 20)
    def test_corpus(self):
        """
        Should be able to pass a SQLPaper to a :class:`.Corpus`.
        """
        # Read the data into the SQL database.
        spapers = wos.read(datapath + '/wos.txt', papers=self.sqlpapers)
        del spapers

        # Now start fresh and spin up a Corpus.
        sqlpapers = SQLPapers(self.conn, dbparams, table='tethne_test')
        c = Corpus(sqlpapers)    # Smoke test: construction should not raise.
Esempio n. 20
0
    def setUp(self):
        """
        Generate a Corpus from some WoS data.
        """
    
        wosdatapath = '{0}/wos.txt'.format(datapath)
        papers = wos.read(wosdatapath)

        # Profile the Corpus constructor and write a call graph image.
        pcgpath = cg_path + 'classes.Corpus.__init__[wos].png'
        with Profile(pcgpath):
            self.D = Corpus(papers, index_by='wosid')
Esempio n. 21
0
    def setUp(self):
        """Create temp output paths for the DTM-style export files."""
        self.temp = tempfile.mkdtemp()
        self.target = os.path.join(self.temp, 'test')
        self.corpus = read(datapath, index_by='wosid',
                           index_features=['citations', 'authors'])

        # The four files the exporter under test is expected to produce.
        for attr, suffix in (('multpath', '-mult.dat'),
                             ('metapath', '-meta.dat'),
                             ('seqpath', '-seq.dat'),
                             ('vpath', '-vocab.dat')):
            setattr(self, attr, self.target + suffix)
Esempio n. 22
0
    def test_cosine_similarity_citations(self):
        """Similarity of two papers sharing the top citation should be > 0."""
        corpus = read(datapath, index_by='wosid')
        citations = corpus.features['citations']

        top = corpus.top_features('citations', 1)[0][0]
        containing = citations.papers_containing(top)

        score = cosine_similarity(citations.features[containing[0]],
                                  citations.features[containing[1]])
        self.assertIsInstance(score, float)
        self.assertGreater(score, 0.)
    def setUp(self):
        """
        Generate a :class:`.Corpus` from some WoS data.
        """
    
        wosdatapath = '{0}/wos.txt'.format(datapath)

        papers = wos.read(wosdatapath)
        
        # Profile the HDF5Corpus constructor and write a call graph image.
        pcgpath = cg_path + 'persistence.hdf5.HDF5Corpus.__init__[wos].png'
        with Profile(pcgpath):
            self.D = HDF5Corpus(papers, index_by='wosid')
Esempio n. 24
0
    def test_kl_divergence_citations(self):
        """KL divergence between two citation vectors should be positive."""
        corpus = read(datapath, index_by='wosid')
        citations = corpus.features['citations']

        top = corpus.top_features('citations', 1)[0][0]
        containing = citations.papers_containing(top)

        divergence = kl_divergence(citations.as_vector(containing[0]),
                                   citations.as_vector(containing[1]))

        self.assertIsInstance(divergence, float)
        self.assertGreater(divergence, 0.)
Esempio n. 25
0
    def parseDirectory(self):
        """
        Recursively searches for WOS txt files and parses each one of them.
        The output CSV will have the following columns

        >>>header = ["WOSID","DATE","TITLE","LASTNAME","FIRSTNAME","JOURNAL","EMAILADDRESS",\
          "PUBLISHER","SUBJECT","WC","AUTHOR_KEYWORDS","INSTITUTE","CO-AUTHORS"]

        :return:
        """
        with (open(self.csv, 'wb')) as headerRecord:
            headerWriter = csv.writer(headerRecord, delimiter=",")
            headerWriter.writerow(header)
        for root, subfolders, files in os.walk(self.location):
            for file in files:
                if '.txt' in file:
                    fullPath = os.path.join(root, file)
                    papers = wos.read(fullPath)
                    print "file Name", file
                    print "total length of papers", len(papers)
                    with open(self.csv, 'a') as csvfile:
                        paperWriter = csv.writer(csvfile, delimiter=",")

                        for paper in papers:
                            set_of_authors = set(paper.authors_full)
                            for author in set_of_authors:
                                currentAuthorSet = set()
                                currentAuthorSet.add(author)
                                coauthorSet = set_of_authors.difference(
                                    currentAuthorSet)
                                lastname = author[0]
                                firstname = author[1]
                                row = getattr(paper, 'wosid', ''), \
                                      str(getattr(paper, 'date', '')), \
                                      getattr(paper, 'title', ''), \
                                      lastname, \
                                      firstname, \
                                      getattr(paper, 'journal', ''), \
                                      getattr(paper, 'emailAddress', []),\
                                      getattr(paper, 'publisher', ''),\
                                      getattr(paper, 'subject', []),\
                                      getattr(paper, 'WC', ''),\
                                      getattr(paper, 'authorKeywords', []),\
                                      getattr(paper, 'authorAddress', ""),\
                                      list(coauthorSet)
                                paperWriter.writerow(row)
    def test_read_title(self):
        """
        Passing the ``parse_only`` parameter to ``read`` limits the fields in
        the resulting :class:`.Paper` instances to only those specified.
        """
        # Only `read` is exercised here; the unused imports and the unused
        # valentin.txt path from the original were removed.
        from tethne.readers.wos import read

        datapath = './tethne/tests/data/wos2.txt'

        corpus = read(datapath, parse_only=['title', 'date'])
        for e in corpus:
            self.assertFalse(hasattr(e, 'journal'))
            self.assertTrue(hasattr(e, 'date'))
            self.assertFalse(hasattr(e, 'authors_full'))
            self.assertFalse(hasattr(e, 'volume'))
            self.assertTrue(hasattr(e, 'title'))
            # 'wosid' is still present despite not being requested —
            # presumably retained as the identifier field; see assertion.
            self.assertTrue(hasattr(e, 'wosid'))
    def test_read(self):
        """
        DfR ``read`` with ``parse_only`` should yield a Corpus whose papers
        carry only the requested fields (plus 'doi'; see assertions).
        """
        # Unused imports (Paper, FeatureSet, ElementTree) and unused data
        # paths from the original were removed.
        from tethne.readers.dfr import read
        from tethne import Corpus

        datapath = './tethne/tests/data/dfr'

        corpus = read(datapath, parse_only=['date', 'title'])

        self.assertIsInstance(corpus, Corpus)

        for e in corpus:
            self.assertFalse(hasattr(e, 'journal'))
            self.assertTrue(hasattr(e, 'date'))
            self.assertFalse(hasattr(e, 'authors_full'))
            self.assertFalse(hasattr(e, 'volume'))
            self.assertTrue(hasattr(e, 'doi'))
Esempio n. 28
0
    def parseFile(self):
        """
        Parses a single WOS file passed in the input.

        The output CSV will have the following columns

        >>> header = ["WOSID","DATE","TITLE","LASTNAME","FIRSTNAME","JOURNAL","EMAILADDRESS",\
          "PUBLISHER","SUBJECT","WC","AUTHOR_KEYWORDS","INSTITUTE","CO-AUTHORS"]

        :return:
        """
        with (open(self.csv, 'wb')) as headerRecord:
            headerWriter = csv.writer(headerRecord, delimiter=",")
            headerWriter.writerow(header)
        papers = wos.read(self.location)
        print "total length of papers", len(papers)
        with open(self.csv, 'a') as csvfile:
            paperWriter = csv.writer(csvfile, delimiter=",")
            for paper in papers:
                set_of_authors = set(paper.authors_full)
                for author in set_of_authors:
                    currentAuthorSet = set()
                    currentAuthorSet.add(author)
                    coauthorSet = set_of_authors.difference(currentAuthorSet)
                    print coauthorSet
                    lastname = author[0]
                    firstname = author[1]
                    row = getattr(paper, 'wosid', ''), \
                        str(getattr(paper, 'date', '')), \
                        getattr(paper, 'title', ''), \
                        lastname, \
                        firstname, \
                        getattr(paper, 'journal', ''), \
                        getattr(paper, 'emailAddress', []),\
                        getattr(paper, 'publisher', ''),\
                        getattr(paper, 'subject', []),\
                        getattr(paper, 'WC', ''),\
                        getattr(paper, 'authorKeywords', []),\
                        getattr(paper, 'authorAddress', ""),\
                        list(coauthorSet)
                    paperWriter.writerow(row)
 def setUp(self):
     """Build a coauthor graph and a temp prefix for writer output."""
     self.temp = tempfile.mkdtemp()
     self.prefix = os.path.join(self.temp, 'textprefix')
     self.corpus = read(datapath)
     self.graph = coauthors(self.corpus)
Esempio n. 30
0
    def test_read(self):
        """
        Streaming read should yield a StreamingCorpus whose papers carry
        correctly-typed field values.
        """
        corpus = read(datapath, streaming=True)
        self.assertIsInstance(corpus, StreamingCorpus)

        derror = "{0} should be {1}, but is {2}"
        dauthorAddressError = "{0} should be {1} or {2}, but is {3}"

        # Fields checked uniformly, in the original order:
        # (attribute, expected type, label for the failure message).
        simple_fields = [('journal', unicode, 'unicode'),
                         ('abstract', unicode, 'unicode'),
                         ('WC', list, 'list'),
                         ('subject', list, 'list'),
                         ('authorKeywords', list, 'list'),
                         ('keywordsPlus', list, 'list'),
                         ('doi', unicode, 'unicode'),
                         ('volume', unicode, 'unicode'),
                         ('title', unicode, 'unicode')]

        for e in corpus:
            if hasattr(e, 'date'):
                self.assertIsInstance(
                    e.date, int, derror.format('date', 'int', type(e.date)))

            # Author name lists additionally require uppercase last and
            # first name components.
            uppererr = "Author names should be uppercase"
            for attr in ('authors_full', 'authors_init'):
                if hasattr(e, attr):
                    names = getattr(e, attr)
                    self.assertIsInstance(
                        names, list,
                        derror.format(attr, 'list', type(names)))
                    for name in names:
                        self.assertTrue(name[0].isupper(), uppererr)
                        self.assertTrue(name[1].isupper(), uppererr)

            for attr, expected, label in simple_fields:
                if hasattr(e, attr):
                    value = getattr(e, attr)
                    self.assertIsInstance(
                        value, expected,
                        derror.format(attr, label, type(value)))

            # authorAddress may be either a single unicode value or a list.
            if hasattr(e, 'authorAddress'):
                self.assertTrue(
                    (type(e.authorAddress) is list
                     or type(e.authorAddress) is unicode),
                    dauthorAddressError.format('authorAddress', 'unicode',
                                               'list', type(e.authorAddress)))
Esempio n. 31
0
 def setUp(self):
     """Parse the WoS fixture file into `self.papers`."""
     wospath = datapath + '/wos.txt'
     self.papers = wos.read(wospath)
Esempio n. 32
0
 def setUp(self):
     """Build a GraphCollection and a temp .xgmml target for the writer."""
     self.corpus = read(datapath)
     self.G = GraphCollection(self.corpus, coauthors)
     # NOTE(review): mkstemp's descriptor `f` is left open — consider
     # closing it; only the path is used by the writer.
     f, self.temp = tempfile.mkstemp(suffix='.xgmml')
 def setUp(self):
     """Index abstracts and fit a 20-topic LDA model on them."""
     c = read(datapath, index_by='wosid')
     c.index_feature('abstract', tokenize)
     self.model = LDAModel(c, featureset_name='abstract')
     self.model.fit(Z=20, max_iter=500)
 def setUp(self):
     """Index the corpus by date and record the most-cited feature."""
     self.corpus = read(datapath, index_by='wosid')
     self.corpus.index('date')
     top = self.corpus.top_features('citations', topn=1)
     self.feature_name = top[0][0]
 def setUp(self):
     """Read the fixture indexed by WoS ID, then add a date index."""
     corpus = read(datapath, index_by='wosid')
     corpus.index('date')
     self.corpus = corpus
Esempio n. 36
0
 def setUp(self):
     """Parse the fixture and build a direct-citation graph from it."""
     self.papers = wos.read(datapath + '/wos.txt')
     self.citation, self.internal = papers.direct_citation(
         self.papers, node_attribs=['date'])
Esempio n. 37
0
 def test_read_nocorpus(self):
     """With corpus=False, read should return a plain list of Papers."""
     result = read(datapath, corpus=False)
     self.assertIsInstance(result, list)
     self.assertIsInstance(result[0], Paper)
 def setUp(self):
     """Build a coauthor graph and a temp .graphml target for the writer."""
     self.corpus = read(datapath)
     self.graph = coauthors(self.corpus)
     # NOTE(review): mkstemp's descriptor `f` is left open — consider
     # closing it; only the path is used by the writer.
     f, self.temp = tempfile.mkstemp(suffix='.graphml')
 def setUp(self):
     """Build a cocitation GraphCollection with a minimum edge weight."""
     self.corpus = read(datapath, index_by='wosid')
     self.corpus.index('date')
     kwargs = {'min_weight': 4}
     self.G = GraphCollection(self.corpus, 'cocitation',
                              method_kwargs=kwargs)
 def setUp(self):
     """Parse the fixture into a plain list of papers (no Corpus)."""
     self.papers = read(datapath, corpus=False)
Esempio n. 41
0
 def setUp(self):
     """Load both WoS fixtures as separate corpora."""
     self.corpus = wos.read(datapath)
     self.corpus2 = wos.read(datapath2)
 def setUp(self):
     """Load the default WoS fixture as a corpus."""
     self.corpus = read(datapath)
Esempio n. 43
0
 def test_read(self):
     """read() should produce a Corpus by default."""
     result = read(datapath)
     self.assertIsInstance(result, Corpus)