예제 #1
0
    def addmetadata(self, row, table):
        """Copy the metadata stored for ``row`` in ``table`` onto this object.

        Sets ``author``, ``title`` and ``date`` (via ``utils.simple_date``),
        ``genres`` (the set of semicolon-separated values found in the
        ``genres`` column), and ``nonmetaflag``, which is True when at least
        one of the genre tags listed below is among ``genres``.
        """
        self.author = table['author'][row]
        self.title = table['title'][row]
        self.date = utils.simple_date(row, table)
        self.genres = set(table['genres'][row].split(';'))

        # Genre tags whose presence raises nonmetaflag (presumably varieties
        # of nonfiction -- confirm with the code that reads the flag).
        flagged_tags = (
            'Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia',
            'Handbooks', 'Indexes', 'Legislation', 'Directories',
            'Statistics', 'Legal cases', 'Legal articles', 'Calendars',
            'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches',
        )
        self.nonmetaflag = any(tag in self.genres for tag in flagged_tags)
예제 #2
0
    def addmetadata(self, row, table):
        """Copy the metadata stored for ``row`` in ``table`` onto this object.

        Sets ``author``, ``title`` and ``date`` (via ``utils.simple_date``),
        ``genres`` (the set of semicolon-separated values found in the
        ``genres`` column), and ``nonmetaflag``, which is True when at least
        one of the genre tags listed below is among ``genres``.
        """
        self.author = table['author'][row]
        self.title = table['title'][row]
        self.date = utils.simple_date(row, table)
        self.genres = set(table['genres'][row].split(';'))

        # Genre tags whose presence raises nonmetaflag (presumably varieties
        # of nonfiction -- confirm with the code that reads the flag).
        flagged_tags = (
            'Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia',
            'Handbooks', 'Indexes', 'Legislation', 'Directories',
            'Statistics', 'Legal cases', 'Legal articles', 'Calendars',
            'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches',
        )
        self.nonmetaflag = any(tag in self.genres for tag in flagged_tags)
# Walk every .txt file in filelist, tokenize it, and extract snippets around
# the target words.
# NOTE(review): filelist, rows, table, sourcedir and alltargetwords are defined
# outside this excerpt.
filelist = [x for x in filelist if x.endswith(".txt")]
contexts = []

# Passed to modelingcounter.extract_snippets below; presumably the half-width
# of each snippet in tokens -- confirm against modelingcounter.
WINDOWRADIUS = 7

ctr = 0

for filename in filelist:

    # The volume id is the file name with '.fic.txt' removed, converted by
    # utils.pairtreelabel.
    htid = utils.pairtreelabel(filename.replace('.fic.txt', ''))

    # Ids that are not in rows are printed and the file is skipped.
    if htid not in rows:
        print(htid)
        continue
    else:
        date = utils.simple_date(htid, table)

    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding = 'utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    # The wordcounter module expects a list of pages, each of which is a list of lines.
    # Ebooks have no pages -- at least as I currently receive them -- so we treat it
    # all as one giant page.

    tokenstream = modelingcounter.makestream(pagelist)

    newcontexts = modelingcounter.extract_snippets(tokenstream,  WINDOWRADIUS, alltargetwords)

    # NOTE(review): the remainder of the loop body is not visible in this excerpt.
    approvedcontexts = []
예제 #4
0
        # NOTE(review): this excerpt opens inside a loop body whose header is not
        # visible; fields, jgenre, htid, selecteddates and selected come from there.
        date = int(fields[1])

        # When jgenre is 'poe', record this htid's date and add the htid to selected.
        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv(
    '/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv')

# Index by date the rows that are not already in selected.
bydate = dict()

for row in rows:
    if row in selected:
        continue

    date = utils.simple_date(row, table)

    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()

# For each selected id, draw a random unselected row that has the same date and
# print its author and title.
for theid, date in selecteddates.items():
    found = False
    # NOTE(review): nothing visible in this excerpt sets found to True; the rest
    # of the loop body is presumably cut off.
    while not found:
        # A date with no unselected rows is absent from bydate, so this lookup
        # would raise KeyError.
        candidates = bydate[date]
        choice = random.sample(candidates, 1)[0]
        print(table["author"][choice])
        print(table["title"][choice])
        # NOTE(review): this excerpt opens inside a loop body whose header is not
        # visible; fields, htid, selecteddates and selected come from there.
        jgenre = fields[13]
        date = int(fields[1])

        # When jgenre is 'poe', record this htid's date and add the htid to selected.
        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv')

# Index by date the rows that are not already in selected.
bydate = dict()

for row in rows:
    if row in selected:
        continue

    date = utils.simple_date(row, table)

    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()

# For each selected id, draw a random unselected row that has the same date and
# print its author and title.
for theid, date in selecteddates.items():
    found = False
    # NOTE(review): nothing visible in this excerpt sets found to True; the rest
    # of the loop body is presumably cut off.
    while not found:
        # A date with no unselected rows is absent from bydate, so this lookup
        # would raise KeyError.
        candidates = bydate[date]
        choice = random.sample(candidates, 1)[0]
        print(table["author"][choice])
        print(table["title"][choice])
# Count tokens and words for every row listed in topicmodelingsample.tsv and
# write one CSV line per row (id, date, counts, author, title).
# NOTE(review): count_words, outtable and counter are defined outside this
# excerpt.
rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)
    if not os.path.isfile(filepath):
        # A missing source file aborts the run; exit with a non-zero status so
        # the failure is not reported to the shell as success.
        print("Missing file: " + filepath)
        sys.exit(1)
    tokencount, wordcount = count_words(filepath)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
    title = table["title"][row]
    newrow = [idcode, date, tokencount, wordcount, author, title]
    outtable.append(newrow)
    # Progress indicator: the running count of rows processed.
    print(counter)
    counter += 1

outfile = '/Users/tunder/Dropbox/GenreProject/metadata/improvedficsample.csv'
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' emit an extra '\r'.
with open(outfile, mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # The header goes through the writer too, so it gets the same line
    # terminator and quoting rules as the data rows.
    writer.writerow(['idcode', 'date', 'tokens', 'words', 'author', 'title'])
    writer.writerows(outtable)

예제 #7
0
        # NOTE(review): this excerpt opens inside a loop body whose header is not
        # visible; fields, jgenre, htid, selecteddates and selected come from there.
        date = int(fields[1])

        # When jgenre is 'fic', record this htid's date and add the htid to selected.
        if jgenre == 'fic':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv(
    '/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv')

# Index by date the rows that are not already in selected.
bydate = dict()

for row in rows:
    if row in selected:
        continue

    date = utils.simple_date(row, table)

    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()

# For each selected id, draw a random unselected row that has the same date and
# print its author and title.
for theid, date in selecteddates.items():
    found = False
    # NOTE(review): nothing visible in this excerpt sets found to True; the rest
    # of the loop body is presumably cut off.
    while not found:
        # A date with no unselected rows is absent from bydate, so this lookup
        # would raise KeyError.
        candidates = bydate[date]
        choice = random.sample(candidates, 1)[0]
        print(table["author"][choice])
        print(table["title"][choice])
# Count tokens and words for every row listed in topicmodelingsample.tsv and
# write one CSV line per row (id, date, counts, author, title).
# NOTE(review): count_words, outtable and counter are defined outside this
# excerpt.
rows, columns, table = utils.readtsv(
    '/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)
    if not os.path.isfile(filepath):
        # A missing source file aborts the run; exit with a non-zero status so
        # the failure is not reported to the shell as success.
        print("Missing file: " + filepath)
        sys.exit(1)
    tokencount, wordcount = count_words(filepath)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
    title = table["title"][row]
    newrow = [idcode, date, tokencount, wordcount, author, title]
    outtable.append(newrow)
    # Progress indicator: the running count of rows processed.
    print(counter)
    counter += 1

outfile = '/Users/tunder/Dropbox/GenreProject/metadata/improvedficsample.csv'
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' emit an extra '\r'.
with open(outfile, mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # The header goes through the writer too, so it gets the same line
    # terminator and quoting rules as the data rows.
    writer.writerow(['idcode', 'date', 'tokens', 'words', 'author', 'title'])
    writer.writerows(outtable)
예제 #9
0
        # NOTE(review): this excerpt opens inside a loop body whose header is not
        # visible; fields, htid, selecteddates and selected come from there.
        jgenre = fields[13]
        date = int(fields[1])

        # When jgenre is 'fic', record this htid's date and add the htid to selected.
        if jgenre == 'fic':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv')

# Index by date the rows that are not already in selected.
bydate = dict()

for row in rows:
    if row in selected:
        continue

    date = utils.simple_date(row, table)

    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()

# For each selected id, draw a random unselected row that has the same date and
# print its author and title.
for theid, date in selecteddates.items():
    found = False
    # NOTE(review): nothing visible in this excerpt sets found to True; the rest
    # of the loop body is presumably cut off.
    while not found:
        # A date with no unselected rows is absent from bydate, so this lookup
        # would raise KeyError.
        candidates = bydate[date]
        choice = random.sample(candidates, 1)[0]
        print(table["author"][choice])
        print(table["title"][choice])