def addmetadata(self, row, table):
    """Copy the metadata for ``row`` out of ``table`` onto this object.

    ``table`` is indexed by column name first and by ``row`` second.
    The call sets five attributes:

    * ``author`` and ``title`` -- taken directly from the table.
    * ``date`` -- whatever ``utils.simple_date(row, table)`` returns.
    * ``genres`` -- the 'genres' cell split on ';' and stored as a set.
    * ``nonmetaflag`` -- True when at least one of those genre tags
      appears in the fixed list below, otherwise False.
    """
    # Genre tags that raise the flag. NOTE(review): these look like
    # nonfiction / reference categories -- confirm the intended meaning.
    flagged_tags = (
        'Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia',
        'Handbooks', 'Indexes', 'Legislation', 'Directories',
        'Statistics', 'Legal cases', 'Legal articles', 'Calendars',
        'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches',
    )

    self.author = table['author'][row]
    self.title = table['title'][row]
    self.date = utils.simple_date(row, table)
    self.genres = set(table['genres'][row].split(';'))

    # Tags are compared exactly as they appear after the split: no
    # whitespace stripping or case folding is applied.
    self.nonmetaflag = not self.genres.isdisjoint(flagged_tags)
filelist = [x for x in filelist if x.endswith(".txt")] contexts = [] WINDOWRADIUS = 7 ctr = 0 for filename in filelist: htid = utils.pairtreelabel(filename.replace('.fic.txt', '')) if htid not in rows: print(htid) continue else: date = utils.simple_date(htid, table) filepath = os.path.join(sourcedir, filename) with open(filepath, encoding = 'utf-8') as f: filelines = f.readlines() pagelist = [filelines] # The wordcounter module expects a list of pages, each of which is a list of lines. # Ebooks have no pages -- at least as I currently receive them -- so we treat it # all as one giant page. tokenstream = modelingcounter.makestream(pagelist) newcontexts = modelingcounter.extract_snippets(tokenstream, WINDOWRADIUS, alltargetwords) approvedcontexts = []
date = int(fields[1]) if jgenre == 'poe': selecteddates[htid] = date selected.add(htid) rows, columns, table = utils.readtsv( '/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv') bydate = dict() for row in rows: if row in selected: continue date = utils.simple_date(row, table) if date in bydate: bydate[date].append(row) else: bydate[date] = [row] controlset = set() for theid, date in selecteddates.items(): found = False while not found: candidates = bydate[date] choice = random.sample(candidates, 1)[0] print(table["author"][choice]) print(table["title"][choice])
jgenre = fields[13] date = int(fields[1]) if jgenre == 'poe': selecteddates[htid] = date selected.add(htid) rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv') bydate = dict() for row in rows: if row in selected: continue date = utils.simple_date(row, table) if date in bydate: bydate[date].append(row) else: bydate[date] = [row] controlset = set() for theid, date in selecteddates.items(): found = False while not found: candidates = bydate[date] choice = random.sample(candidates, 1)[0] print(table["author"][choice]) print(table["title"][choice])
# Collect one output row (id, date, token/word counts, author, title) for
# every volume listed in the metadata table, then write them out as CSV.
# `outtable`, `counter` and `count_words` come from earlier in this script.
rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)

    if not os.path.isfile(filepath):
        # A missing text aborts the whole run. Exit with a nonzero status
        # so the shell (or a calling script) sees the failure; the
        # original exited with 0, which reports success.
        print("Missing file: " + filepath)
        sys.exit(1)

    tokencount, wordcount = count_words(filepath)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
    title = table["title"][row]

    outtable.append([idcode, date, tokencount, wordcount, author, title])

    # Progress indicator: number of volumes processed so far.
    print(counter)
    counter += 1

outfile = '/Users/tunder/Dropbox/GenreProject/metadata/improvedficsample.csv'

# newline='' is what the csv module requires of files handed to
# csv.writer; without it, platforms that translate '\n' emit an extra
# '\r' on every row.
with open(outfile, mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # Write the header through the writer as well, so it gets the same
    # line terminator and quoting rules as the data rows.
    writer.writerow(['idcode', 'date', 'tokens', 'words', 'author', 'title'])
    writer.writerows(outtable)
date = int(fields[1]) if jgenre == 'fic': selecteddates[htid] = date selected.add(htid) rows, columns, table = utils.readtsv( '/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv') bydate = dict() for row in rows: if row in selected: continue date = utils.simple_date(row, table) if date in bydate: bydate[date].append(row) else: bydate[date] = [row] controlset = set() for theid, date in selecteddates.items(): found = False while not found: candidates = bydate[date] choice = random.sample(candidates, 1)[0] print(table["author"][choice]) print(table["title"][choice])
# Collect one output row (id, date, token/word counts, author, title) for
# every volume listed in the metadata table, then write them out as CSV.
# `outtable`, `counter` and `count_words` come from earlier in this script.
rows, columns, table = utils.readtsv(
    '/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)

    if not os.path.isfile(filepath):
        # A missing text aborts the whole run. Exit with a nonzero status
        # so the shell (or a calling script) sees the failure; the
        # original exited with 0, which reports success.
        print("Missing file: " + filepath)
        sys.exit(1)

    tokencount, wordcount = count_words(filepath)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
    title = table["title"][row]

    outtable.append([idcode, date, tokencount, wordcount, author, title])

    # Progress indicator: number of volumes processed so far.
    print(counter)
    counter += 1

outfile = '/Users/tunder/Dropbox/GenreProject/metadata/improvedficsample.csv'

# newline='' is what the csv module requires of files handed to
# csv.writer; without it, platforms that translate '\n' emit an extra
# '\r' on every row.
with open(outfile, mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # Write the header through the writer as well, so it gets the same
    # line terminator and quoting rules as the data rows.
    writer.writerow(['idcode', 'date', 'tokens', 'words', 'author', 'title'])
    writer.writerows(outtable)
jgenre = fields[13] date = int(fields[1]) if jgenre == 'fic': selecteddates[htid] = date selected.add(htid) rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv') bydate = dict() for row in rows: if row in selected: continue date = utils.simple_date(row, table) if date in bydate: bydate[date].append(row) else: bydate[date] = [row] controlset = set() for theid, date in selecteddates.items(): found = False while not found: candidates = bydate[date] choice = random.sample(candidates, 1)[0] print(table["author"][choice]) print(table["title"][choice])