def rlinput(prompt, prefill=''):
    """Prompt for one line of input with ``prefill`` already in the buffer.

    The prefill is inserted into the readline edit buffer by a startup
    hook, so the user can edit it or simply accept it.  The hook is
    uninstalled again before returning, even when reading fails.

    Args:
        prompt: Text displayed in front of the editable line.
        prefill: Initial content of the edit buffer.

    Returns:
        The line that was read, as returned by ``raw_input``/``input``.
    """
    # ``raw_input`` only exists on Python 2.  On Python 3 the equivalent
    # function is ``input`` (Python 2's ``input`` would eval the text, so
    # it must not be used there).
    try:
        read_line = raw_input  # noqa: F821 - Python 2 builtin
    except NameError:
        read_line = input

    def hook():
        readline.insert_text(prefill)

    readline.set_startup_hook(hook)
    try:
        return read_line(prompt)
    finally:
        # Calling without arguments removes the hook, so later prompts
        # are not prefilled.
        readline.set_startup_hook()
def main(infile, outfile, start_year, start_abstract):
    """Interactively review the documents of a JSONL file and save them.

    Each non-blank line of ``infile`` is parsed as one JSON document.  The
    documents are sorted by their ``'file'`` entry and offered for review
    one at a time.  For every accepted document the first six non-empty
    content lines are printed and ``update_field`` is called for the
    ``grobid:header_Title`` and ``grobid:header_Authors`` metadata fields.
    Afterwards all documents, reviewed or not, are written to ``outfile``
    as one JSON object per line.

    The base name of each ``'file'`` entry is parsed as
    ``<year>_<abstract>.<ext>`` with both parts being integers.

    Args:
        infile: Path of the JSONL file to read.
        outfile: Path of the JSONL file to write.
        start_year: Skip documents whose year is lower than this value;
            -1 disables the filter.
        start_abstract: Within ``start_year``, skip documents whose
            abstract number is lower than this value; -1 disables the
            filter.  Only valid together with ``start_year``.

    Exits the process with status 1 when ``infile`` does not exist or when
    ``start_abstract`` is given without ``start_year``.
    """
    # check arguments
    if not os.path.exists(infile):
        print('%s file not found, exiting.' % infile)
        sys.exit(1)
    if start_abstract > -1 and start_year == -1:
        print('Warning: start_abstract specified, but no start year; ' +
              'please specify starting year with -y.')
        sys.exit(1)

    # ``raw_input`` is Python 2 only; Python 3 calls the same thing ``input``.
    try:
        read_line = raw_input  # noqa: F821 - Python 2 builtin
    except NameError:
        read_line = input

    # Read in JSONL file.  Blank lines (e.g. a trailing newline at the end
    # of the file) carry no document and would make json.loads fail.
    with open(infile, 'r') as jf:
        docs = [json.loads(line) for line in jf if line.strip()]
    print('Obtained %d docs.' % len(docs))

    # Sort by filename
    docs.sort(key=lambda d: d['file'])

    for doc in docs:
        # If you want to skip to a particular abstract, use these lines.
        base_name = doc['file'].split('/')[-1].split('.')[0]
        (yr, abst) = map(int, base_name.split('_'))
        if start_year > -1 and yr < start_year:
            continue
        if start_abstract > -1 and yr == start_year and abst < start_abstract:
            continue

        # Make sure no prefill hook is active, so this prompt starts empty.
        readline.set_startup_hook()
        s = read_line('Review %s? [Y/n/q]' % doc['file'])
        if s in ('n', 'N'):
            continue
        if s in ('q', 'Q'):
            break

        # Show the first few lines
        print('-------------------------------')
        non_empty = [line for line in doc['content'].split('\n')
                     if len(line) > 1]
        for line in non_empty[:6]:
            print(line)

        update_field(doc, 'grobid:header_Title')
        update_field(doc, 'grobid:header_Authors')
        print('\n')

    # Write out new JSONL file
    print('Writing to %s' % outfile)
    with open(outfile, 'wb') as out:
        for doc in docs:
            # json.dumps escapes non-ASCII characters by default, so the
            # ASCII encoding is lossless and yields bytes on Python 2 and 3.
            out.write((json.dumps(doc) + '\n').encode('ascii'))
    print('Stored %d documents in %s' % (len(docs), outfile))
def update_field(doc, grobid_f_name):
    """Let the user edit one metadata field of ``doc`` in place.

    The current value of ``doc['metadata'][grobid_f_name]`` (or a guess
    derived from ``doc['content']`` when the title is empty) is offered as
    editable prefill text.  Whatever the user enters is stored back into
    the metadata and echoed.

    Args:
        doc: Document dict with ``'file'``, ``'content'`` and ``'metadata'``
            entries; ``doc['metadata']`` is modified.
        grobid_f_name: Metadata key to edit, e.g. ``'grobid:header_Title'``.
    """
    # Local import so this function does not depend on a module-level
    # ``import re`` being present.
    import re

    value = doc['metadata'].get(grobid_f_name, '')
    if value == '' and grobid_f_name == 'grobid:header_Title':
        # If title is unknown, try to guess it from content.
        match = re.search(r'[^\.]+\.[^\.\n]+\n+([^\.]+)\.', doc['content'])
        # No match simply means there is no guess; keep the empty value.
        if match:
            value = match.group(1).title()

    # Standardize title capitalization.  Operate on ``value`` rather than
    # on the stored metadata so a guessed title is not thrown away.
    if value.isupper():
        value = string.capwords(value)

    # On Python 2 readline wants a byte string; a unicode value has to be
    # encoded first.  On Python 3 ``str`` is passed through unchanged.
    prefill = value
    if not isinstance(prefill, str):
        prefill = prefill.encode(
            getattr(sys.stdin, 'encoding', None) or 'utf-8')

    # ``raw_input`` is Python 2 only; Python 3 calls the same thing ``input``.
    try:
        read_line = raw_input  # noqa: F821 - Python 2 builtin
    except NameError:
        read_line = input

    readline.set_startup_hook(lambda: readline.insert_text(prefill))
    try:
        new_value = read_line('%s: Edit the %s: ' %
                              (doc['file'].split('/')[-1], grobid_f_name))
    finally:
        # Uninstall the hook so the next prompt is not prefilled with
        # this field's text.
        readline.set_startup_hook()

    # Python 2 returns bytes from raw_input; store text in the metadata.
    if isinstance(new_value, bytes):
        new_value = new_value.decode('utf8')
    doc['metadata'][grobid_f_name] = new_value
    print('New value: %s' % doc['metadata'][grobid_f_name])
# Demonstrates readline's startup and pre-input hooks: both insert text
# into the line buffer of every prompt shown below.
try:
    import gnureadline as readline
except ImportError:
    import readline


def startup_hook():
    """Insert fixed text into the buffer; installed as the startup hook."""
    readline.insert_text('from startup_hook')


def pre_input_hook():
    """Insert more text and redraw; installed as the pre-input hook."""
    readline.insert_text(' from pre_input_hook')
    readline.redisplay()


readline.parse_and_bind('tab: complete')
readline.set_pre_input_hook(pre_input_hook)
readline.set_startup_hook(startup_hook)

# Echo every entered line until the user types "stop".
line = input('Prompt ("stop" to quit): ')
while line != 'stop':
    print('ENTERED: {!r}'.format(line))
    line = input('Prompt ("stop" to quit): ')
# Title, authors, primaryauthor (+ venue, year?)
# NOTE(review): this is a fragment; ``d`` (the document), ``f`` (field name),
# ``f_old`` (key used to keep the previous value) and ``s`` (the Solr
# connection) are defined by surrounding code that is not visible here.
default_value = d[f]

# If title is Unknown, try to guess it from content.
try:
    if f == 'title' and default_value == 'Unknown':
        default_value = re.search(r'[^\.]+\.[^\.\n]+\n+([^\.]+)\.',
                                  d['content']).group(1).title()
except (AttributeError, KeyError, TypeError):
    # Best effort only: no regex match (None has no .group), missing
    # 'content', or non-text content all leave the default untouched.
    pass

# Strip numbers out of authors
if f == 'authors':
    # Note: authors is a string inside a list, hence [0]
    default_value = re.sub(r'[0-9]', '', unicode(default_value[0]))

# Offer the current value as editable prefill text.
readline.set_startup_hook(lambda: readline.insert_text(default_value))
new_value = raw_input('Edit the %s: ' % f)
new_value = unicode(new_value, 'utf8')

if new_value != d[f]:
    # Keep the previous value under f_old before overwriting it.
    d[f_old] = d[f]
    print('Updating Solr. Hang onto your hat!')
    d[f] = new_value
    # This also seems to commit by default, yay.
    # Why don't I need to put f_old here?
    s.add([d], fieldUpdates={f: 'set'})

# Copyright 2017, by the California Institute of Technology. ALL
# RIGHTS RESERVED. United States Government Sponsorship
# acknowledged. Any commercial use must be negotiated with the Office
# Readline hook demo: a startup hook and a pre-input hook each insert text
# into the buffer of every prompt.
#end_pymotw_header
try:
    import gnureadline as readline
except ImportError:
    import readline


def startup_hook():
    """Put fixed text into the line buffer (used as the startup hook)."""
    readline.insert_text('from startup_hook')


def pre_input_hook():
    """Append more text and refresh the line (used as the pre-input hook)."""
    readline.insert_text(' from pre_input_hook')
    readline.redisplay()


readline.set_startup_hook(startup_hook)
readline.set_pre_input_hook(pre_input_hook)
readline.parse_and_bind('tab: complete')

# Keep prompting and echoing until "stop" is entered.
while True:
    line = input('Prompt ("stop" to quit): ')
    if line != 'stop':
        print('ENTERED: {!r}'.format(line))
    else:
        break
def rlinput(prompt, prefill=''):
    """Read a line from the user, starting with ``prefill`` in the buffer.

    A startup hook inserts ``prefill`` into the readline buffer; the hook
    is removed again once the read finishes or fails.

    Args:
        prompt: Text displayed in front of the editable line.
        prefill: Initial content of the edit buffer.

    Returns:
        The string returned by ``input(prompt)``.
    """
    def _insert_prefill():
        return readline.insert_text(prefill)

    readline.set_startup_hook(_insert_prefill)
    try:
        entered = input(prompt)
    finally:
        # No argument: uninstall the hook.
        readline.set_startup_hook()
    return entered