def divide_into_quotations(booklist):
    """Split each book's review lines into Quotation objects.

    Lines are accumulated until a line that scores as a review citation
    is reached (review name, optional +/- sentiment marks, numeric
    volume/page/date tokens, word count). The accumulated lines then
    become one Quotation attributed to that citation, and accumulation
    starts over.

    Parameters
    ----------
    booklist : iterable
        Objects exposing ``reviewlines`` (a sequence of strings) and
        ``publisher`` (a string). ``publisher`` may be overwritten in
        place when one of the first four lines looks like a trailing
        piece of the bibliographic entry and supplies a publisher name.

    Returns
    -------
    list
        Quotation objects for all books, in the order encountered.
        Lines that follow the last recognised citation of a book are
        not turned into a Quotation.
    """
    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')

    # First word of each review name, kept only when it is longer than
    # four characters; used below for fuzzy matching of a line's first
    # token when OCR may have damaged the review name.
    longreviewnames = set()
    for rev in reviewdict.keys():
        reviewparts = rev.split()
        if reviewparts and len(reviewparts[0]) > 4:
            longreviewnames.add(reviewparts[0])

    # Only membership is tested, so a set is used.
    publishers = {
        'Liverlght', 'Appleton', 'Baker', 'Barnes', 'Benziger', 'Bobbs',
        "Brentano's", 'Cassell', 'Century', 'Collier-Fox', 'Crowell',
        'Ditson', 'Dodd', 'Doran', 'Doubleday', 'Dutton', 'Elder', 'Estes',
        'Ginn', 'Goodspeed', 'Harper', 'Heath', 'Holt', 'Houghton', 'Knopf',
        'Lane', 'Lippincott', 'Little', 'Liveright', 'Longmans',
        'Macmillan', 'McBride', 'McClure', 'McGraw', 'Moffat', 'Oxford',
        'Page', 'Pott', 'Putnam', 'Scribner', 'Simmons', 'Stokes', 'Walton',
        'Warne', 'Wessels', 'Wilde', 'Wiley', 'Winston', 'Yale',
    }

    # (tag, pattern) pairs handed to lexparse. Patterns are written as
    # raw strings so that regex escapes such as \( or \d are not treated
    # as (invalid) Python string escapes; the string values themselves
    # are the same as before.
    lexical_patterns = [
        ('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('reviewword', all_reviewwords),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', r'''['"•■]\d+'''),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', r"[A-Z'\,]+"),
        ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', r'''["'“‘]+\S*'''),
        ('plusorminus', r'[\+\-\—]+'),
        ('reviewword', all_reviewwords),
        ('wordcount', r'\d*0w[.]?'),
        ('OCRwordcount', r'\S*Ow[.]?'),
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')
    ocrwordcountregex = re.compile(r'\S*Ow[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    # e.g. '4-' is a fairly common ocr misread for +
    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    # Tokens that, when they occur before the first numeric token of a
    # citation line, are read as sentiment marks rather than as part of
    # the review name. Each maps to the marks it contributes.
    sentiment_marks = {misread: ('+',) for misread in plusmisreads}
    sentiment_marks.update({
        '+': ('+',),
        '-': ('-',), '—': ('-',), '—-': ('-',),
        '=-': ('-',), '--': ('-',), '-—': ('-',),
        '==': ('-',), '=--': ('-',),
        '+-': ('+', '-'), '+—': ('+', '-'), '+=': ('+', '-'),
        '-+': ('-', '+'), '—+': ('-', '+'), '=+': ('-', '+'),
        # The trailing '-' used to be appended after a `continue` and
        # was therefore never recorded.
        '++-': ('+', '+', '-'), '++—': ('+', '+', '-'),
        '++=': ('+', '+', '-'),
        '++': ('+', '+'),
        '+++': ('+', '+', '+'),
    })

    for book in booklist:
        lines = book.reviewlines
        accumulated = []
        citationcount = 0
        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):
            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            # Text that followed a word count on the previous citation
            # line belongs to this line.
            if addtonext:
                line = addtonext + ' ' + line
                addtonext = ''

            # This line was already absorbed into the previous citation.
            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if not tokens:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # In the first four lines we often have fragments
            # left over from the book bibliographical entry.
            if linecount <= 3:
                trailingbibline = any(
                    'hyphennumber' in tags or 'dollarprice' in tags
                    or 'centprice' in tags
                    for tags in taglist.tagseq)

                if trailingbibline:
                    # get the existing publisher to see if it makes more
                    # sense fused with something in this trailing line
                    existingpubparts = book.publisher.split()
                    if existingpubparts:
                        existingpub = existingpubparts[-1].strip('-')
                    else:
                        existingpub = 'not a publisher'

                    tokenssofar = []
                    for earlierline in accumulated:
                        tokenssofar.extend(earlierline.strip().split())
                    tokenssofar.extend(tokens)
                    tokenssofar = [x.strip('.,[]()-') for x in tokenssofar]

                    for tok in tokenssofar:
                        if tok in publishers:
                            book.publisher = tok
                        # A publisher name split across lines: the last
                        # word of the recorded publisher plus this token.
                        rejoined = existingpub + tok
                        if rejoined in publishers:
                            book.publisher = book.publisher.strip('-') + tok

                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.
            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    citationcount += 1
                    allquotes.append(
                        Quotation(book, 'summary', '', 'summary', accumulated))
                    # this line (opening with a quote) will be part of
                    # the next quotation
                    accumulated = [line]
                    continue

            # Count the clues that suggest this line is a citation.
            numberwords = 0
            reviewwords = 0
            plusyet = False
            totalclues = 0

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags:
                    reviewwords += 1
                    totalclues += 1
                elif 'plusorminus' in tags and not plusyet:
                    # only the first sentiment token counts
                    reviewwords += 0.5
                    totalclues += 1
                    plusyet = True
                elif 'monthabbrev' in tags:
                    totalclues += 1
                elif ('somenumeric' in tags and '-' not in word
                        and ',' not in word):
                    numberwords += 1
                    totalclues += 1
                    # Numeric tokens shaped like a word count ('...w'),
                    # a 'vol:page' pair or a 'p...' page reference are
                    # given extra weight.
                    if word.endswith('w'):
                        totalclues += 1
                        reviewwords += 0.5
                    elif ':' in word:
                        totalclues += 1
                        reviewwords += 0.5
                    elif word.startswith('p'):
                        totalclues += 1
                        reviewwords += 0.5

            # fuzzy match in situations where everything is there except
            # the review because it could easily be ocr error
            if numberwords > 0 and totalclues > 2 and reviewwords < 0.9:
                firstword = taglist.stringseq[0]
                if len(firstword) > 3:
                    for longname in longreviewnames:
                        if match_strings(firstword, longname) > .7:
                            reviewwords += 1
                            totalclues += 1
                            break

            if not (numberwords > 0 and reviewwords > 0.9
                    and totalclues > 3):
                # not enough evidence of a citation: ordinary review text
                accumulated.append(line)
                continue

            # This line is a citation. Tokens before the first numeric
            # token are sentiment marks or the review name; everything
            # from the first numeric token on is the citation proper.
            sentimentbits = []
            publisherbits = []
            citationbits = []
            numericyet = False
            tokencount = len(taglist.stringseq)

            for nextwordctr, (word, tags) in enumerate(
                    zip(taglist.stringseq, taglist.tagseq), start=1):

                if not numericyet:
                    if word in sentiment_marks:
                        sentimentbits.extend(sentiment_marks[word])
                        continue
                    if nextwordctr == 1 and word == 'H':
                        # this is a weird but common misread; however,
                        # it's risky enough that we should only do it
                        # in first position
                        sentimentbits.extend(('+', '-'))
                        continue

                if 'somenumeric' in tags:
                    numericyet = True

                if numericyet:
                    citationbits.append(word)
                else:
                    publisherbits.append(word)

                # A word count ends the citation; whatever follows on
                # this line is carried over to the next line.
                if (numericyet
                        and ('wordcount' in tags or 'OCRwordcount' in tags)
                        and nextwordctr < tokencount):
                    addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                    break

            # if this line doesn't end with a word count, and the next
            # one does? probably a continuation
            if (citationbits
                    and not wordcountregex.fullmatch(citationbits[-1])
                    and not ocrwordcountregex.fullmatch(citationbits[-1])
                    and linecount < len(lines) - 1):
                wordsinnextline = lines[linecount + 1].strip().split()
                if (0 < len(wordsinnextline) < 3
                        and wordcountregex.fullmatch(wordsinnextline[-1])):
                    citationbits.extend(wordsinnextline)
                    skipnext = True

            sentiment = ' '.join(sentimentbits)
            review = ' '.join(publisherbits)
            cite = ' '.join(citationbits)

            citationcount += 1
            allquotes.append(
                Quotation(book, review, sentiment, cite, accumulated))
            accumulated = []

    return allquotes
def divide_into_quotations(booklist):
    """Split each book's review lines into Quotation objects.

    Lines are accumulated until a line that scores as a review citation
    is reached; the accumulated lines then become one Quotation
    attributed to that citation, and accumulation starts over.

    A line's score is one point for its first review-word token, one
    point for every plus/minus token and one point for every numeric
    token that contains neither a hyphen nor a comma. A line counts as a
    citation when it scores above 2, or above 1 once past the first two
    lines of the book.

    Parameters
    ----------
    booklist : iterable
        Objects exposing ``reviewlines`` (a sequence of strings).

    Returns
    -------
    list
        Quotation objects for all books, in the order encountered.
        Lines that follow the last recognised citation of a book are
        not turned into a Quotation.
    """
    # Only the review words are needed here; the second value returned
    # by get_names is not used in this function.
    all_reviewwords, _ = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')

    # (tag, pattern) pairs handed to lexparse. Patterns are written as
    # raw strings so that regex escapes such as \( or \d are not treated
    # as (invalid) Python string escapes; the string values themselves
    # are the same as before.
    lexical_patterns = [
        ('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('reviewword', all_reviewwords),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', r'''['"•■]\d+'''),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', r"[A-Z'\,]+"),
        ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', r'''["'“‘]+\S*'''),
        ('plusorminus', r'[\+\-\—]+'),
        ('reviewword', all_reviewwords),
        ('wordcount', r'\d*0w[.]?'),
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    # e.g. '4-' is a fairly common ocr misread for +
    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    # Tokens that, when they occur before the first numeric token of a
    # citation line, are read as sentiment marks rather than as part of
    # the review name. Each maps to the marks it contributes.
    sentiment_marks = {misread: ('+',) for misread in plusmisreads}
    sentiment_marks.update({
        '+': ('+',),
        '-': ('-',), '—': ('-',), '—-': ('-',),
        '=-': ('-',), '--': ('-',), '-—': ('-',),
        '==': ('-',), '=--': ('-',),
        '+-': ('+', '-'), '+—': ('+', '-'), '+=': ('+', '-'),
        '-+': ('-', '+'), '—+': ('-', '+'), '=+': ('-', '+'),
        # The trailing '-' used to be appended after a `continue` and
        # was therefore never recorded.
        '++-': ('+', '+', '-'), '++—': ('+', '+', '-'),
        '++=': ('+', '+', '-'),
        '++': ('+', '+'),
        '+++': ('+', '+', '+'),
    })

    for book in booklist:
        lines = book.reviewlines
        accumulated = []
        citationcount = 0
        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):
            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            # Text that followed a word count on the previous citation
            # line belongs to this line.
            if addtonext:
                line = addtonext + ' ' + line
                addtonext = ''

            # This line was already absorbed into the previous citation.
            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if not tokens:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # In the first four lines we often have fragments
            # left over from the book bibliographical entry.
            if linecount <= 3:
                trailingbibline = any(
                    'hyphennumber' in tags or 'dollarprice' in tags
                    or 'centprice' in tags
                    for tags in taglist.tagseq)

                if trailingbibline:
                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.
            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    citationcount += 1
                    allquotes.append(
                        Quotation(book, 'summary', '', 'summary', accumulated))
                    # this line (opening with a quote) will be part of
                    # the next quotation
                    accumulated = [line]
                    continue

            # Score the line as a possible citation.
            oddsofreview = 0
            reviewwordyet = False

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags and not reviewwordyet:
                    # only the first review word counts
                    oddsofreview += 1
                    reviewwordyet = True
                if 'plusorminus' in tags:
                    oddsofreview += 1
                if ('somenumeric' in tags and '-' not in word
                        and ',' not in word):
                    oddsofreview += 1

            if not ((oddsofreview > 1 and linecount > 1)
                    or oddsofreview > 2):
                # odds of review 1 or less
                accumulated.append(line)
                continue

            # This line is a citation. Tokens before the first numeric
            # token are sentiment marks or the review name; everything
            # from the first numeric token on is the citation proper.
            sentimentbits = []
            publisherbits = []
            citationbits = []
            numericyet = False
            tokencount = len(taglist.stringseq)

            for nextwordctr, (word, tags) in enumerate(
                    zip(taglist.stringseq, taglist.tagseq), start=1):

                if not numericyet:
                    if word in sentiment_marks:
                        sentimentbits.extend(sentiment_marks[word])
                        continue
                    if nextwordctr == 1 and word == 'H':
                        # this is a weird but common misread; however,
                        # it's risky enough that we should only do it
                        # in first position
                        sentimentbits.extend(('+', '-'))
                        continue

                if 'somenumeric' in tags:
                    numericyet = True

                if numericyet:
                    citationbits.append(word)
                else:
                    publisherbits.append(word)

                # A word count ends the citation; whatever follows on
                # this line is carried over to the next line.
                if (numericyet and 'wordcount' in tags
                        and nextwordctr < tokencount):
                    addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                    break

            # if this line doesn't end with a word count, and the next
            # one does? probably a continuation
            if (citationbits
                    and not wordcountregex.fullmatch(citationbits[-1])
                    and linecount < len(lines) - 1):
                wordsinnextline = lines[linecount + 1].strip().split()
                if (wordsinnextline
                        and wordcountregex.fullmatch(wordsinnextline[-1])):
                    citationbits.extend(wordsinnextline)
                    skipnext = True

            sentiment = ' '.join(sentimentbits)
            review = ' '.join(publisherbits)
            cite = ' '.join(citationbits)

            citationcount += 1
            allquotes.append(
                Quotation(book, review, sentiment, cite, accumulated))
            accumulated = []

    return allquotes
#
import csv
import glob
import re
import sys
from collections import Counter

import read_pubnames

# Matches a digit with up to three non-whitespace characters on either
# side. Written as a raw string so that \S and \d reach the regex engine
# without being treated as (invalid) Python string escapes.
anynumregex = re.compile(r'\S{0,3}\d\S{0,3}')

stopwords = {'the', 'from', 'of', 'and'}

# The first command-line argument is the year; it is used below as the
# filename prefix when looking for the input .tsv. Running the script
# without an argument raises IndexError here.
args = sys.argv
year = args[1]

all_reviewwords, reviewdict = read_pubnames.get_names('brd_pubs_indexed1940s.tsv')

# Review names with periods removed, plus the pseudo-review 'summary'.
all_review_codes = {x.replace('.', '') for x in reviewdict.keys()}
all_review_codes.add('summary')

# One publisher name per line, surrounding whitespace stripped.
with open('40spublishers.txt', encoding='utf-8') as f:
    publishers = [line.strip() for line in f]

targetpaths = glob.glob('/media/secure_volume/brd/output/' + year + '*.tsv')

# Use the first matching file; targetpath is left undefined here when
# nothing matches.
if len(targetpaths) > 0:
    targetpath = targetpaths[0]