def handle_args():
    parser = argparse.ArgumentParser(
        description="Parse a QLD Members' Interests PDF to a database.")
    parser.add_argument('input', help='the PDF file to parse')
    parser.add_argument('--dropall', action='store_true',
                        help='drop all tables before processing begins')
    return parser.parse_args()
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--file', help='input pdf',
        default='/Users/Dhruv/Downloads/Sample roll call vote PDF_multiple columns[2].pdf')
    return parser.parse_args()
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", required=True,
                        help="complete location of the input pdf file")
    parser.add_argument("-d", "--destination_file", required=False,
                        help="complete location where the output csv file will be created")
    args = parser.parse_args()
    # --input_file is required, so it is always set; destination_file is
    # None when the flag is omitted.
    return args.input_file, args.destination_file
def parse_arguments():
    """Build the argument parser and return the parsed arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", "-i", type=str,
                        default='/raid/antoloui/Master-thesis/Data/QA/Cisco_CCNA.pdf',
                        help="The input file from which to extract text.")
    parser.add_argument("--outdir", "-o", type=str, default=None,
                        help="The output directory.")
    arguments, _ = parser.parse_known_args()
    return arguments
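# A note on parse_known_args, used above: unlike parse_args, it does not exit
# on unrecognised flags; it returns them in a second list. This matters when a
# wrapper (e.g. a notebook kernel) injects extra argv entries. Minimal runnable
# sketch (names are illustrative, not from the snippet above):
import argparse

_demo = argparse.ArgumentParser()
_demo.add_argument("--infile")
_known, _unknown = _demo.parse_known_args(["--infile", "a.pdf", "--bogus"])
assert _known.infile == "a.pdf" and _unknown == ["--bogus"]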
class SkillFinder(Resource):
    parser = reqparse.RequestParser()
    parser.add_argument('skill', type=str, required=True,
                        help="This field cannot be blank")

    def post(self):
        print("reaching this stage")
        data = SkillFinder.parser.parse_args()
        skill = data['skill']
        # The bare name `parser` below resolves to tika's parser module
        # (imported at module level), not to the RequestParser class
        # attribute, which is only reachable as SkillFinder.parser.
        data_from_file = parser.from_file('example.pdf')
        list_of_words = data_from_file['content'].split()
        count_for_skillset = 0
        for word in list_of_words:
            if word == skill:
                count_for_skillset += 1
        print(count_for_skillset)
def _build_parser():
    parser = ArgumentParser()
    parser.add_argument('--file', type=str, dest='filepath', required=True,
                        help='File to parse')
    parser.add_argument('--language', type=str, dest='language', required=False,
                        default=DEFAULT_LANG,
                        help='Language of file. Default: English')
    parser.add_argument('--outpath', type=str, dest='outpath', required=False,
                        default=DEFAULT_OUTFILE,
                        help='Name of output file')
    return parser
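# Typical use of a builder like _build_parser: construct it once, then parse
# real argv or an explicit list (convenient in tests). DEFAULT_LANG and
# DEFAULT_OUTFILE are module-level constants assumed by the snippet above.
#
#   parser = _build_parser()
#   args = parser.parse_args(['--file', 'report.pdf'])
#   # args.filepath == 'report.pdf'; args.language == DEFAULT_LANG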
bulk(client, docs)


def main(args):
    docs = Docs(args.databasename, args.server, args.username, args.password,
                args.driver, args.doc_json, args.index_mapping,
                args.index_name)
    docs.gen_docs(args.sql_script)
    docs.indexing_files()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Creating elasticsearch documents and indexing them')
    parser.add_argument('--databasename', default="Buddie-Search-Testing",
                        help='databasename')
    parser.add_argument('--server', default='ontwikkel-db.database.windows.net',
                        help='azure sql server server')
    parser.add_argument('--username', default="username",
                        help='azure sql server username')
    parser.add_argument('--password', default='....',
                        help='azure sql server password')
    parser.add_argument('--driver', default='{ODBC Driver 17 for SQL Server}',
                        help='driver for azure sql server')
    parser.add_argument('--doc_json', default='docs.jsonl',
def _read_pdf_tika(path):
    # type: (str) -> str
    # `parser` here is tika's parser module, imported elsewhere in this file.
    raw = parser.from_file(path)
    # return raw["metadata"]
    return raw["content"]


def read_pdf(path, engine="pdfminer"):
    # type: (str, str) -> str
    try:
        func = {
            "pdfminer": _read_pdf_pdfminer,
            "tika": _read_pdf_tika,
        }[engine]
    except KeyError:
        raise ValueError("Engine {} doesn't exist".format(engine))
    return func(path)


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Merge pdf files in directory into one file.")
    parser.add_argument("dir", help="input directory")
    parser.add_argument("out", help="output file path")
    args = parser.parse_args()
    join_pdfs_in_folder(args.dir, args.out)
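# Usage sketch for read_pdf above (assumes a pdfminer-based _read_pdf_pdfminer
# is defined elsewhere in the module, alongside _read_pdf_tika):
#
#   text = read_pdf("paper.pdf", engine="tika")
#   read_pdf("paper.pdf", engine="nope")  # ValueError: Engine nope doesn't exist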
def main(url):
    # Download data
    file = project0.fetchincidents(url)
    # Extract data
    incidents = project0.extractincidents(file)
    # Create database
    db = project0.createdb()
    # Insert data
    project0.populatedb(incidents)
    # Print status
    project0.status()


if __name__ == '__main__':
    # Parse the URL to pass into main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--incidents", type=str, required=True,
                        help="The arrest summary url.")
    args = parser.parse_args()
    if args.incidents:
        main(args.incidents)
def _parse_args():
    parser = argparse.ArgumentParser(
        description='Download privacy policies, optionally update the DB')
    parser.add_argument('input_path',
                        help='Path to file where policy urls are located.')
    parser.add_argument('output_dir',
                        help='Path to directory where policies will be saved. '
                             'Creates directory structure '
                             '<output_dir>/<date>/<regiontag>/<domain>/<urlhash>/')
    parser.add_argument('--processes', '-p', default=multiprocessing.cpu_count(),
                        type=int, help='Number of processes to use')
    parser.add_argument('--check_previous', '-c', default=False,
                        action='store_true',
                        help='Check downloaded policies against previous policies')
    parser.add_argument('--language', '-l', default='en-US, en',
                        help="Language string to set in Firefox's "
                             'intl.accept_languages option. '
                             'Defaults to "en-US, en"')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Enable verbose logging')
    return parser.parse_args()
    f = codecs.open(os.path.join(outPath, file[:-4] + '.txt'), 'w',
                    encoding='utf8', errors='ignore')
    f.write(parsed['content'])
    f.close()


if __name__ == '__main__':
    try:
        import argparse

        parser = argparse.ArgumentParser()
        # NOTE: argparse's type=bool is misleading: any non-empty string
        # (including "False") is truthy, so this flag is effectively always
        # True unless an empty string is passed.
        parser.add_argument(
            '-config', type=bool, default=True,
            help='Whether parameters are read from a config file. '
                 'If True, command line parameters are bypassed; '
                 'if False, parameters must be passed on the command line.')
        parser.add_argument(
            '-path', help='The path to the original document files folder.')
        parser.add_argument(
            '-o', '--output',
            help='The path to the extracted text output folder')
        args = parser.parse_args()
    except:  # bare except also catches the SystemExit argparse raises on bad args
        args = None
        print('No arguments! Using default path:')
    main(args)
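# The type=bool pitfall noted above is usually worked around with an explicit
# string-to-bool converter; a minimal sketch (str2bool is not part of the
# original script):
def str2bool(value):
    # argparse passes the raw command-line string; map common spellings.
    return str(value).strip().lower() in ('1', 'true', 'yes', 'y')
# usage: parser.add_argument('-config', type=str2bool, default=True)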
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="""Extract and save text and metadata from files.""")
    parser.add_argument("dirin", help="Directory of original files.")
    parser.add_argument("dirout",
                        help="Directory in which to store the extracted text.")
    parser.add_argument(
        "--recursivo", default=False, action="store_true",
        help="Also visit subdirectories. (%(default)s) E.g.: --recursivo")
    parser.add_argument(
        "--exts", action="append", required=False,
        help="Extract only from these file types. E.g.: --exts pdf --exts docx")
    parser.add_argument(
        "--basura", action="append",
        help="Remove these characters. E.g.: --basura '<>!#' --basura � ")
    parser.add_argument(
        "--chars", default=0, type=int,
        help="Drop text with fewer characters than this. (%(default)s). "
             "E.g.: --chars 10")
    args = parser.parse_args()

    dirin = args.dirin
    dirout = Path(args.dirout).resolve()
    recursivo = args.recursivo
    exts = args.exts
    basura = args.basura
    chars = args.chars

    n = extraer_todos(dirin, dirout, recursivo=recursivo, exts=exts,
                      basura=basura, chars=chars)
    print(f"{n} new files saved in folder {str(dirout)}")
    paragraphs = re.split('\n{2,}', text)
    # Keep paragraphs containing a run of at least 10 Thai characters,
    # then strip the remaining newlines within each paragraph.
    paragraphs = [
        re.sub('[\n]', '', x) for x in paragraphs if re.search('[ก-์]{10}', x)
    ]
    sentences = [x for p in paragraphs for x in pythainlp.sent_tokenize(p)]
    output_file = os.path.splitext(f)[0] + '.sent'
    with open(output_file, mode='w') as out:
        for s in sentences:
            # out.write(' '.join(pythainlp.tokenize.word_tokenize(s)))
            out.write(s)
            out.write('\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--en_dir', type=str)
    parser.add_argument('--th_dir', type=str)
    args = parser.parse_args()

    pdf_ens = glob.glob(f'{args.en_dir}*.pdf')
    pdf_ths = glob.glob(f'{args.th_dir}*.pdf')
    print(f'There are {len(pdf_ens)} en documents and {len(pdf_ths)} th documents')

    # pdf2text
    for pdf_th in tqdm.tqdm(pdf_ths):
        pdf2text_th(pdf_th)
    for pdf_en in tqdm.tqdm(pdf_ens):
        pdf2text_en(pdf_en)
def initArgparse() -> ArgumentParser:
    parser = ArgumentParser(
        description="A directory tree metadata parser using Apache Tika; "
                    "by default it runs arguments: -d, -f, -m, -s",
    )
    parser.add_argument(
        "-v", "--version", action="version",
        version=f"{parser.prog} version {VERSION}",
    )
    parser.add_argument("DIRECTORY", type=Path, default=".", nargs="+",
                        help="directory(s) to parse")
    parser.add_argument("-d", "--directorytree", action="store_true",
                        help="create directory tree")
    parser.add_argument(
        "-e", "--exclude", nargs="+",
        help="directory(s) to exclude, includes subdirectories",
    )
    parser.add_argument("-f", "--filetree", action="store_true",
                        help="creates a json and csv file tree")
    parser.add_argument("-m", "--metadata", action="store_true",
                        help="parse metadata")
    parser.add_argument(
        "-nm", "--newmetadata", action="store_true",
        help="create individual metadata files in a 'tikatree' directory",
    )
    parser.add_argument("-s", "--sfv", action="store_true",
                        help="create sfv file")
    parser.add_argument("-y", "--yes", action="store_true",
                        help="automatically overwrite older files")
    return parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import argparse

import tika
from tika import parser
from pyltp import Segmentor
import Levenshtein

# NOTE: this rebinding shadows the `parser` module imported from tika above;
# any later tika call must go through tika.parser instead.
parser = argparse.ArgumentParser(
    description="Process the patient records in outpatient service")
parser.add_argument("-d", "--data", type=str, required=True,
                    help="Specify the data directory")
parser.add_argument("-f", "--feature", type=str, required=True,
                    help="Specify the important keys to be extracted")
parser.add_argument("-o", "--output", type=str, required=True,
                    help="Specify the results directory")
parser.add_argument("-dict", "--dictionary", type=str, required=True,
def generate_text(markov_chain, words):
    state = get_random_state(markov_chain)
    text = state.split()[:words]
    while len(text) < words:
        state = get_next_state(markov_chain, state)
        if state is None:
            # Dead end: restart from a random state.
            state = get_random_state(markov_chain)
        text.append(state.split()[-1])
    return ' '.join(text)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Markov Chain Text Generator')
    parser.add_argument('-f', '--file', required=True,
                        help='Name of file to read text from.')
    parser.add_argument('-o', '--order', default=1, type=int,
                        help='Number of past states each state depends on.')
    parser.add_argument('-w', '--words', default=100, type=int,
                        help='Number of words to generate.')
    pargs = parser.parse_args()

    tokens = tokenise_text_file(pargs.file)
    markov_chain = create_markov_chain(tokens, order=pargs.order)
    print(generate_text(markov_chain, pargs.words))
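# get_random_state / get_next_state are not shown in this snippet. A minimal
# sketch, assuming the chain maps each state string to a list of possible
# successor states (the names and structure here are assumptions, not the
# original implementation):
import random


def get_random_state(markov_chain):
    # Pick a uniformly random state to (re)start from.
    return random.choice(list(markov_chain))


def get_next_state(markov_chain, state):
    # Return a random successor of `state`, or None at a dead end.
    successors = markov_chain.get(state)
    return random.choice(successors) if successors else None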
    if args.what == 'doi' or args.what == 'dois':
        regexp = r'(https?://(?:dx\.)?doi\.org/[0-9]{2}\.[0-9]{4,6}/\S*)'
    elif args.what == 'url' or args.what == 'urls':
        regexp = r'(https?://\S*)'
    else:
        raise ValueError(
            'Unrecognised value of the `what` argument: {}'.format(args.what))
    matches = re.findall(regexp, raw_text['content'])
    # return the harvest, one entry per line
    matches = list(set(matches))
    print('\n'.join(matches))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manage bib files.')
    parser.add_argument('--bibfile', type=str, default='')
    subparsers = parser.add_subparsers()

    # ---- parser for add reference command
    parser_add = subparsers.add_parser('add',
                                       help='Add reference to bibliography.')
    parser_add.add_argument('what', type=str)
    parser_add.add_argument('ids', nargs='*')
    parser_add.set_defaults(action=_add_reference)

    # ---- parser for print command
    parser_print = subparsers.add_parser(
        'print', help='Print to terminal the bibtex entry.')
    parser_print.add_argument('what', nargs='*')
    parser_print.add_argument('--where', type=str,
                              default='all')  # can be: doi, arxiv, all.
    parser_print.set_defaults(action=_print_reference)
    # parser_print.add_argument('--action', type=str, default='print')
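# How the set_defaults(action=...) wiring above is typically dispatched once
# parsing is done (a sketch; the script's own entry point is not shown here):
#
#   args = parser.parse_args()
#   args.action(args)  # calls _add_reference or _print_reference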