def test_build(self):
    """Building from a corpus should yield one graph per date index."""
    c = read(datapath)
    collection = GraphCollection()
    collection.build(c, coauthors)
    self.assertEqual(len(collection), len(c.indices['date']))
def setUp(self):
    """Load the corpus and the expected name-variants for one author."""
    self.corpus = wos.read(datapath)
    self.label = 'ALBERTINDF'
    # All observed spellings of this author's name in the fixture.
    self.members = [
        u'ALBERTINDF',
        u'ALBERTINID',
        u'ALBERTINID F',
        u'ALBERTINIDAVID',
        u'ALBERTINIDAVID F',
        u'ALBERTINIDF',
    ]
def test_init_build(self):
    """
    Should build `graphs` if passed a corpus and method.
    """
    c = read(datapath, index_fields=['ayjid', 'date'])
    collection = GraphCollection(c, coauthors)
    self.assertEqual(len(collection), len(c.indices['date']))
def test_build_streaming(self):
    """Streaming corpora should also build one graph per date index."""
    c = read(datapath, streaming=True)
    collection = GraphCollection()
    collection.build(c, coauthors, slice_kwargs={'feature_name': 'authors'})
    self.assertEqual(len(collection), len(c.indices['date']))
def test_global_closeness_node(self):
    """Closeness for a single named node should be a positive float."""
    g = coauthors(read(datapath))
    score = global_closeness_centrality(g, node=('SEDA', 'B C'))
    self.assertIsInstance(score, float)
    self.assertGreater(score, 0)
def setUp(self):
    """Build a date-sliced Corpus with abstract features from WoS data."""
    wosdatapath = '{0}/wos.txt'.format(datapath)
    self.corpus = Corpus(wos.read(wosdatapath), index_by='ayjid')
    # One-year time-period slices.
    self.corpus.slice('date', method='time_period', window_size=1)
    self.corpus.abstract_to_features()
def setUp(self):
    """Create (or reset) the test tables and load the WoS fixture.

    First tries to create both tables outright; if that fails (most
    likely because a previous run left them behind), drops any leftovers
    and recreates them from scratch.
    """
    self.conn = psycopg2.connect(dbargs)
    self.cur = self.conn.cursor()
    try:
        # Optimistic path: assume the tables do not exist yet.
        self.cur.execute(PAPER_TABLE.format('tethne_test'))
        self.conn.commit()
        self.cur.execute(PAPER_TABLE.format('tethne_test_citations'))
        self.conn.commit()
    except psycopg2.Error:
        # Creation failed (tables probably already exist). The
        # transaction is aborted, so roll back before issuing more
        # statements (the original committed here, which PostgreSQL
        # treats as a rollback anyway -- rollback states the intent).
        self.conn.rollback()
        try:
            self.cur.execute("""DROP TABLE tethne_test;""")
        except psycopg2.Error:
            self.conn.rollback()
        try:
            self.cur.execute("""DROP TABLE tethne_test_citations;""")
        except psycopg2.Error:
            self.conn.rollback()
        self.conn.commit()
        # Now recreate both tables cleanly.
        self.cur.execute(PAPER_TABLE.format('tethne_test'))
        self.cur.execute(PAPER_TABLE.format('tethne_test_citations'))
        self.conn.commit()
    self.cur.close()
    self.sqlpapers = SQLPapers(self.conn, dbparams, table='tethne_test')
    self.papers = wos.read(datapath + '/wos.txt')
def test_iter(self):
    """Iterating a SQL-backed container should visit every Paper."""
    spapers = wos.read(datapath + '/wos.txt', papers=self.sqlpapers)
    count = 0
    for paper in spapers:
        self.assertIsInstance(paper, Paper)
        count += 1
    # Should iterate over all Papers.
    self.assertEqual(count, len(spapers))
def test_upload(self):
    """Uploading the small fixture should issue exactly eight POSTs."""
    from tethne.readers import wos
    corpus = wos.read('tests/data/wos.txt')
    client, get, post = _new_mocked_client()
    client.upload(corpus, 'a test', 'WOS', 1000, skip_duplicates=False)
    # Authorization, corpus creation, and then six models.
    self.assertEqual(post.call_count, 8)
def setUp(self):
    """Fit an LDA model on a transformed copy of the abstract features."""
    c = read(datapath, index_by='wosid')
    c.index_feature('abstract', tokenize)
    # Triple every count; transform callback signature is
    # (feature, count, corpus_count, document_count).
    triple = lambda f, c_, C, DC: c_ * 3
    c.features['xf'] = c.features['abstract'].transform(triple)
    self.model = LDAModel(c, featureset_name='xf')
    self.model.fit(Z=20, max_iter=500)
def handle(self, *args, **options):
    """Stream-parse a WoS file and load it through the CorpusHandler."""
    path = options.get('path')[0]
    label = options.get('label')[0]
    batch_size = options.get('batch_size')[0]
    # Stream to avoid loading the full corpus into memory at once.
    corpus = wos.read(path, streaming=True)
    handler = CorpusHandler(corpus, label, batch_size)
    handler.run()
def test_global_closeness(self):
    """Closeness should be a dict with an entry for every graph node."""
    g = coauthors(read(datapath))
    scores = global_closeness_centrality(g)
    self.assertIsInstance(scores, dict)
    for node in g.nodes():
        self.assertIn(node, scores)
def setUp(self):
    """Prepare output paths and an indexed corpus for export tests."""
    self.temp = tempfile.mkdtemp()
    self.target = os.path.join(self.temp, 'test')
    self.corpus = read(datapath, index_by='wosid',
                       index_features=['citations', 'authors'])
    self.dpath = self.target + '_docs.txt'
    self.mpath = self.target + '_meta.csv'
def test_init_build_streaming(self):
    """
    Should build `graphs` if passed a corpus and method.
    """
    c = read(datapath, streaming=True)
    collection = GraphCollection(c, coauthors,
                                 slice_kwargs={'feature_name': 'authors'})
    self.assertEqual(len(collection), len(c.indices['date']))
def testBasic(self):
    """Writing the coauthor graph to SIF should produce an empty file."""
    self.corpus = read(datapath)
    self.graph = coauthors(self.corpus)
    handle, self.temp = tempfile.mkstemp(suffix='.toSif')
    to_sif(self.graph, self.temp)
    self.assertEqual(os.path.getsize(self.temp), 0)
def test_read_wos(self):
    """
    When passed as a kwarg to the WoS reader, should be used as the
    container for parsed :class:`.Paper`\s.
    """
    container = wos.read(datapath + '/wos.txt', papers=self.sqlpapers)
    self.assertIsInstance(container, SQLPapers)
    self.assertEqual(len(container), 10)
    self.assertIsInstance(container[0], Paper)
def test_upload_alt(self):
    """
    This Corpus is big, and has special characters in it.
    """
    from tethne.readers import wos
    corpus = wos.read('tests/data/1-500.txt')
    client, get, post = _new_mocked_client()
    client.upload(corpus, 'a test', 'WOS', 5000, skip_duplicates=False)
    # Authorization, corpus creation, and then six models.
    self.assertEqual(post.call_count, 38)
def test_upload_keyerror(self):
    """
    This Corpus is big, and generated a keyerror this one time.
    """
    from tethne.readers import wos
    corpus = wos.read('tests/data/21001-21500.txt')
    client, get, post = _new_mocked_client()
    client.upload(corpus, 'a test', 'WOS', 5000, skip_duplicates=False)
    # Authorization, corpus creation, and then six models.
    self.assertEqual(post.call_count, 20)
def test_corpus(self):
    """
    Should be able to pass a SQLPaper to a :class:`.Corpus`\.
    """
    # Read the data into the SQL database, then discard the handle.
    loaded = wos.read(datapath + '/wos.txt', papers=self.sqlpapers)
    del loaded
    # Now start fresh and spin up a Corpus from the stored rows.
    store = SQLPapers(self.conn, dbparams, table='tethne_test')
    c = Corpus(store)
def setUp(self):
    """Generate a Corpus from some WoS data, profiling construction."""
    wosfile = '{0}/wos.txt'.format(datapath)
    parsed = wos.read(wosfile)
    # Profile Corpus construction and write a call-graph image.
    pcgpath = cg_path + 'classes.Corpus.__init__[wos].png'
    with Profile(pcgpath):
        self.D = Corpus(parsed, index_by='wosid')
def setUp(self):
    """Prepare DTM-style output paths and an indexed corpus."""
    self.temp = tempfile.mkdtemp()
    self.target = os.path.join(self.temp, 'test')
    self.corpus = read(datapath, index_by='wosid',
                       index_features=['citations', 'authors'])
    # Standard DTM input file names.
    self.multpath = self.target + '-mult.dat'
    self.metapath = self.target + '-meta.dat'
    self.seqpath = self.target + '-seq.dat'
    self.vpath = self.target + '-vocab.dat'
def test_cosine_similarity_citations(self):
    """Cosine similarity of two citation features is a positive float."""
    c = read(datapath, index_by='wosid')
    top_cited = c.top_features('citations', 1)[0][0]
    # Two papers that share the most-cited reference.
    containing = c.features['citations'].papers_containing(top_cited)
    first = c.features['citations'].features[containing[0]]
    second = c.features['citations'].features[containing[1]]
    similarity = cosine_similarity(first, second)
    self.assertIsInstance(similarity, float)
    self.assertGreater(similarity, 0.)
def setUp(self):
    """Generate a :class:`.Corpus` from some WoS data, profiled."""
    wosfile = '{0}/wos.txt'.format(datapath)
    parsed = wos.read(wosfile)
    # Profile HDF5Corpus construction and write a call-graph image.
    pcgpath = cg_path + 'persistence.hdf5.HDF5Corpus.__init__[wos].png'
    with Profile(pcgpath):
        self.D = HDF5Corpus(parsed, index_by='wosid')
def test_kl_divergence_citations(self):
    """KL divergence of two citation vectors is a positive float."""
    c = read(datapath, index_by='wosid')
    top_cited = c.top_features('citations', 1)[0][0]
    # Two papers that share the most-cited reference.
    containing = c.features['citations'].papers_containing(top_cited)
    first = c.features['citations'].as_vector(containing[0])
    second = c.features['citations'].as_vector(containing[1])
    divergence = kl_divergence(first, second)
    self.assertIsInstance(divergence, float)
    self.assertGreater(divergence, 0.)
def parseDirectory(self): """ Recursively searches for WOS txt files and parses each one of them. The output CSV will have the following columns >>>header = ["WOSID","DATE","TITLE","LASTNAME","FIRSTNAME","JOURNAL","EMAILADDRESS",\ "PUBLISHER","SUBJECT","WC","AUTHOR_KEYWORDS","INSTITUTE","CO-AUTHORS"] :return: """ with (open(self.csv, 'wb')) as headerRecord: headerWriter = csv.writer(headerRecord, delimiter=",") headerWriter.writerow(header) for root, subfolders, files in os.walk(self.location): for file in files: if '.txt' in file: fullPath = os.path.join(root, file) papers = wos.read(fullPath) print "file Name", file print "total length of papers", len(papers) with open(self.csv, 'a') as csvfile: paperWriter = csv.writer(csvfile, delimiter=",") for paper in papers: set_of_authors = set(paper.authors_full) for author in set_of_authors: currentAuthorSet = set() currentAuthorSet.add(author) coauthorSet = set_of_authors.difference( currentAuthorSet) lastname = author[0] firstname = author[1] row = getattr(paper, 'wosid', ''), \ str(getattr(paper, 'date', '')), \ getattr(paper, 'title', ''), \ lastname, \ firstname, \ getattr(paper, 'journal', ''), \ getattr(paper, 'emailAddress', []),\ getattr(paper, 'publisher', ''),\ getattr(paper, 'subject', []),\ getattr(paper, 'WC', ''),\ getattr(paper, 'authorKeywords', []),\ getattr(paper, 'authorAddress', ""),\ list(coauthorSet) paperWriter.writerow(row)
def test_read_title(self):
    """
    Passing the ``parse_only`` parameter to ``read`` limits the fields
    in the resulting :class:`.Paper` instances to only those specified.
    """
    from tethne.readers.wos import WoSParser, read
    from tethne import Corpus, Paper
    datapath = './tethne/tests/data/wos2.txt'
    datapath_v = './tethne/tests/data/valentin.txt'
    corpus = read(datapath, parse_only=['title', 'date'])
    for paper in corpus:
        # Unrequested fields are absent; requested fields (plus the
        # index field, wosid) are present.
        self.assertFalse(hasattr(paper, 'journal'))
        self.assertTrue(hasattr(paper, 'date'))
        self.assertFalse(hasattr(paper, 'authors_full'))
        self.assertFalse(hasattr(paper, 'volume'))
        self.assertTrue(hasattr(paper, 'title'))
        self.assertTrue(hasattr(paper, 'wosid'))
def test_read(self):
    """DfR ``read`` with ``parse_only`` should limit Paper fields."""
    from tethne.readers.dfr import read
    from tethne import Corpus, Paper, FeatureSet
    import xml.etree.ElementTree as ET
    datapath = './tethne/tests/data/dfr'
    datapath_float_weights = './tethne/tests/data/dfr_float_weights'
    sample_datapath = './tethne/tests/data/test_citations_sample.xml'
    corpus = read(datapath, parse_only=['date', 'title'])
    self.assertIsInstance(corpus, Corpus)
    for paper in corpus:
        self.assertFalse(hasattr(paper, 'journal'))
        self.assertTrue(hasattr(paper, 'date'))
        self.assertFalse(hasattr(paper, 'authors_full'))
        self.assertFalse(hasattr(paper, 'volume'))
        self.assertTrue(hasattr(paper, 'doi'))
def parseFile(self): """ Parses a single WOS file passed in the input. The output CSV will have the following columns >>> header = ["WOSID","DATE","TITLE","LASTNAME","FIRSTNAME","JOURNAL","EMAILADDRESS",\ "PUBLISHER","SUBJECT","WC","AUTHOR_KEYWORDS","INSTITUTE","CO-AUTHORS"] :return: """ with (open(self.csv, 'wb')) as headerRecord: headerWriter = csv.writer(headerRecord, delimiter=",") headerWriter.writerow(header) papers = wos.read(self.location) print "total length of papers", len(papers) with open(self.csv, 'a') as csvfile: paperWriter = csv.writer(csvfile, delimiter=",") for paper in papers: set_of_authors = set(paper.authors_full) for author in set_of_authors: currentAuthorSet = set() currentAuthorSet.add(author) coauthorSet = set_of_authors.difference(currentAuthorSet) print coauthorSet lastname = author[0] firstname = author[1] row = getattr(paper, 'wosid', ''), \ str(getattr(paper, 'date', '')), \ getattr(paper, 'title', ''), \ lastname, \ firstname, \ getattr(paper, 'journal', ''), \ getattr(paper, 'emailAddress', []),\ getattr(paper, 'publisher', ''),\ getattr(paper, 'subject', []),\ getattr(paper, 'WC', ''),\ getattr(paper, 'authorKeywords', []),\ getattr(paper, 'authorAddress', ""),\ list(coauthorSet) paperWriter.writerow(row)
def setUp(self):
    """Create a coauthor graph and a temp directory for writer output."""
    self.corpus = read(datapath)
    self.graph = coauthors(self.corpus)
    self.temp = tempfile.mkdtemp()
    self.prefix = os.path.join(self.temp, 'textprefix')
def test_read(self):
    """Streaming read yields a StreamingCorpus with well-typed fields."""
    corpus = read(datapath, streaming=True)
    self.assertIsInstance(corpus, StreamingCorpus)
    derror = "{0} should be {1}, but is {2}"
    dauthorAddressError = "{0} should be {1} or {2}, but is {3}"
    uppererr = "Author names should be uppercase"
    # Optional fields and their expected types, checked only when present.
    simple_fields = [('journal', unicode, 'unicode'),
                     ('abstract', unicode, 'unicode'),
                     ('WC', list, 'list'),
                     ('subject', list, 'list'),
                     ('authorKeywords', list, 'list'),
                     ('keywordsPlus', list, 'list'),
                     ('doi', unicode, 'unicode'),
                     ('volume', unicode, 'unicode'),
                     ('title', unicode, 'unicode')]
    for e in corpus:
        if hasattr(e, 'date'):
            self.assertIsInstance(e.date, int,
                                  derror.format('date', 'int', type(e.date)))
        # Author name lists: each entry is (LAST, FIRST), both uppercase.
        for name_field in ('authors_full', 'authors_init'):
            if not hasattr(e, name_field):
                continue
            names = getattr(e, name_field)
            self.assertIsInstance(names, list,
                                  derror.format(name_field, 'list',
                                                type(names)))
            for a in names:
                self.assertTrue(a[0].isupper(), uppererr)
                self.assertTrue(a[1].isupper(), uppererr)
        for field, ftype, tname in simple_fields:
            if not hasattr(e, field):
                continue
            value = getattr(e, field)
            self.assertIsInstance(value, ftype,
                                  derror.format(field, tname, type(value)))
        # authorAddress may be either a single unicode value or a list.
        if hasattr(e, 'authorAddress'):
            self.assertTrue(
                (type(e.authorAddress) is list
                 or type(e.authorAddress) is unicode),
                dauthorAddressError.format('authorAddress', 'unicode',
                                           'list', type(e.authorAddress)))
def setUp(self):
    """Parse the WoS fixture into a list of papers."""
    wosfile = '{0}/wos.txt'.format(datapath)
    self.papers = wos.read(wosfile)
def setUp(self):
    """Build a GraphCollection and a temporary .xgmml target file."""
    self.corpus = read(datapath)
    self.G = GraphCollection(self.corpus, coauthors)
    handle, self.temp = tempfile.mkstemp(suffix='.xgmml')
def setUp(self):
    """Fit a 20-topic LDA model on tokenized abstracts."""
    c = read(datapath, index_by='wosid')
    c.index_feature('abstract', tokenize)
    self.model = LDAModel(c, featureset_name='abstract')
    self.model.fit(Z=20, max_iter=500)
def setUp(self):
    """Index the corpus by date and pick the single most-cited feature."""
    self.corpus = read(datapath, index_by='wosid')
    self.corpus.index('date')
    self.feature_name = self.corpus.top_features('citations', topn=1)[0][0]
def setUp(self):
    """Load a wosid-indexed corpus and add a date index."""
    self.corpus = read(datapath, index_by='wosid')
    self.corpus.index('date')
def setUp(self):
    """Build a direct-citation graph (with dates) from the WoS fixture."""
    self.papers = wos.read(datapath + '/wos.txt')
    self.citation, self.internal = papers.direct_citation(
        self.papers, node_attribs=['date'])
def test_read_nocorpus(self):
    """With ``corpus=False``, read should return a plain list of Papers."""
    parsed = read(datapath, corpus=False)
    self.assertIsInstance(parsed, list)
    self.assertIsInstance(parsed[0], Paper)
def setUp(self):
    """Build a coauthor graph and a temporary .graphml target file."""
    self.corpus = read(datapath)
    self.graph = coauthors(self.corpus)
    handle, self.temp = tempfile.mkstemp(suffix='.graphml')
def setUp(self):
    """Build a cocitation GraphCollection over date-indexed WoS data."""
    self.corpus = read(datapath, index_by='wosid')
    self.corpus.index('date')
    # Edges require at least four co-citations.
    self.G = GraphCollection(self.corpus, 'cocitation',
                             method_kwargs={'min_weight': 4})
def setUp(self):
    """Parse the fixture into a plain list of papers (no Corpus)."""
    self.papers = read(datapath, corpus=False)
def setUp(self):
    """Load both WoS fixtures as corpora."""
    self.corpus = wos.read(datapath)
    self.corpus2 = wos.read(datapath2)
def setUp(self):
    """Load the WoS fixture as a Corpus."""
    self.corpus = read(datapath)
def test_read(self):
    """``read`` should return a :class:`.Corpus` by default."""
    result = read(datapath)
    self.assertIsInstance(result, Corpus)