def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]

                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)

                v1 = Vector(file1, ast.literal_eval(file1_parsedData["content"]))
                v2 = Vector(file2, ast.literal_eval(file2_parsedData["content"]))
                row_cosine_distance.append(v1.cosTheta(v2))

                a.writerow(row_cosine_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
            except Exception, e:
                pass
def run_exit_tool_on_known_type(dir_list):
    file_list = get_file_list(dir_list)
    for entry in file_list:
        parser.from_file(entry)
    return
def computeScores(inputDir, outCSV, acceptTypes, allKeys):
    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        filename_list = []
        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))

        filename_list = [filename for filename in filename_list if parser.from_file(filename)]
        if acceptTypes:
            filename_list = [filename for filename in filename_list
                             if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
        else:
            print "Accepting all MIME Types....."

        files_tuple = itertools.combinations(filename_list, 2)
        for file1, file2 in files_tuple:
            row_edit_distance = [file1, file2]

            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)

            intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys())
            intersect_features = [feature for feature in intersect_features if feature not in na_metadata]

            file_edit_distance = 0.0
            for feature in intersect_features:
                file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                file2_feature_value = stringify(file2_parsedData["metadata"][feature])
                feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / \
                    (len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
                file_edit_distance += feature_distance

            if allKeys:
                file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]
                file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]
                file_edit_distance += len(file1_only_features) + len(file2_only_features)
                file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
            else:
                file_edit_distance /= float(len(intersect_features))    # average edit distance

            row_edit_distance.append(1 - file_edit_distance)
            a.writerow(row_edit_distance)
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)
    in_files = get_files(in_dir)
    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)
        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
def intersect(json_filename, output_name, index_file, start_index=0, end_index=yaoner.MAX_INT_VALUE):
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    if index_file is None:
        index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    with open(json_filename) as json_file:
        json_data = json.load(json_file)
    concept_dictionary = dict()
    for key in json_data.keys():
        concept_dictionary[key.lower()] = {}
    file_list = yaoner.read_index_file(index_file, base_directory, start_index, end_index)
    for idx, val in enumerate(file_list):
        print(start_index + idx)
        parsed = parser.from_file(''.join([base_directory, val]))
        if 'content' in parsed and parsed['content'] is not None:
            content = parsed['content']
            words = content.split()
            for word in words:
                lowercased = word.lower()
                if lowercased in concept_dictionary:
                    last_part = os.path.basename(val)
                    concept_dictionary[lowercased][last_part] = 1
    dump(concept_dictionary, output_name + 'from' + str(start_index) + 'to' + str(end_index) + '.json')
    return
def extract(path):
    parsed = parser.from_file(path)
    content = parsed["content"]
    ners = StanfordExtractor(content).extract()
    entities = CustomEntityExtractor(content).extract()
    quantities = QuantityExtractor(content).getQuantities()

    if len(ners['LOCATION']) > 0:
        l = GeoTopic(map(lambda l: l['name'], ners['LOCATION']))
        geo = l.getInfo()
        locations = l.getLocations()
    else:
        geo = []
        locations = []

    return {
        'geo': geo,
        'locations': locations,
        'entities': entities['entities'],
        'places': ners['LOCATION'],
        'dates': ners['DATE'],
        'quantities': quantities,
        'metadata': parsed['metadata'],
        'mime-type': parsed['metadata']['Content-Type'],
        'id': idf.set(path)
    }
def filterFiles(inputDir, acceptTypes):
    filename_list = []
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    filename_list = [filename for filename in filename_list if parser.from_file(filename)]
    if acceptTypes:
        filename_list = [filename for filename in filename_list
                         if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
    else:
        print "Accepting all MIME Types....."

    return filename_list
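# Hedged usage sketch, not from the original source: filterFiles() is assumed to take the
# root directory of a crawl and a list of MIME subtypes such as ['pdf', 'html']; passing an
# empty list accepts every type. A Tika server reachable by tika-python is assumed to be running.
if __name__ == '__main__':
    pdf_and_html = filterFiles('/data/crawl', ['pdf', 'html'])
    everything = filterFiles('/data/crawl', [])
    print "%d typed files, %d total files" % (len(pdf_and_html), len(everything))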
def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            i = 0
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))
                if i >= pages:  # break after x pages
                    break
            log.debug("Processed %i pages (%i max)", i, pages)
            return '\n'.join(text_array)
        except:  # reraise everything
            raise
    else:
        try:
            content = parser.from_file(fname)['content']
            return (content or '').encode('UTF-8')
        except:  # reraise everything
            raise
def compareValueSimilarity(fileDir, encoding='utf-8'):
    union_feature_names = set()
    file_parsed_data = {}
    resemblance_scores = {}
    file_metadata = {}

    for filename in fileDir:
        file_parsed = []
        parsedData = parser.from_file(filename)
        file_metadata[filename] = parsedData["metadata"]

        for key in parsedData["metadata"].keys():
            value = parsedData["metadata"].get(key)[0]
            if isinstance(value, list):
                value = ""
                for meta_value in parsedData["metadata"].get(key)[0]:
                    value += meta_value
            file_parsed.append(str(key.strip(' ').encode(encoding) + ": " + value.strip(' ').encode(encoding)))

        file_parsed_data[filename] = set(file_parsed)
        union_feature_names = union_feature_names | set(file_parsed_data[filename])

    total_num_features = len(union_feature_names)

    for filename in file_parsed_data.keys():
        overlap = {}
        overlap = file_parsed_data[filename] & set(union_feature_names)
        resemblance_scores[filename] = float(len(overlap)) / total_num_features

    sorted_resemblance_scores = sorted(resemblance_scores.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_resemblance_scores, file_metadata
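# Hedged usage sketch (assumption, not part of the original source): despite its name,
# compareValueSimilarity() iterates fileDir as a list of file paths, so here it is fed a
# plain directory listing; '/data/docs' is a placeholder path.
import os
doc_paths = [os.path.join('/data/docs', name) for name in os.listdir('/data/docs')]
scores, metadata = compareValueSimilarity(doc_paths)
for path, score in scores:
    print "%s overlaps the union of metadata features by %.3f" % (path, score)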
def run_ner(start_index=0, end_index=MAX_INT_VALUE):
    index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    file_list = read_index_file(index_file, base_directory, start_index, end_index)
    measurement_list = []
    index = 0 + start_index
    for entry in file_list:
        print(index)
        parsed = parser.from_file(''.join([base_directory, entry]))
        if 'metadata' in parsed:
            if 'X-TIKA:EXCEPTION:embedded_exception' in parsed['metadata']:
                index += 1
                continue
        if 'content' in parsed:
            if parsed['content'] is not None:
                # print(json.dumps(parsed['metadata'], indent=4))
                # print(parsed['content'])
                # print('content size ', len(parsed['content']))
                if len(parsed['content']) > 1 * 1024 * 1024:
                    index += 1
                    continue
                measurements = extract_measurement(parsed['content'])
                if measurements is not None and len(measurements) > 0:
                    measurement_list.append({entry.split('/')[-1]: measurements})
        index += 1
    dump_to_json(measurement_list, '/Users/Frank/working-directory/ner-measurement-mentions/',
                 'from' + str(start_index) + 'to' + str(end_index))
    return
def getKeywords(pdfFile, Occur):
    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)
    metadata = parsed["metadata"]
    doccontent = parsed["content"]

    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    count = 0
    keywords = []
    shortkey = []
    maxoccur = Occur
    for s in sorteddict:
        numocc = int(s[0])
        word = s[1].encode('utf-8')
        if numocc > maxoccur:
            keyword = {word: str(numocc)}
            keywords.append(keyword)
            if len(word) > 6:
                shortkey.append(word.lower())
        count = count + 1
    if Occur > 0:
        return shortkey
    return keywords
def load_topics(filename):
    languages.append(language.from_file(filename))
    parser_obj = parser.from_file(filename)
    if 'content' in parser_obj and parser_obj['content']:
        words.extend(get_nouns(parser_obj['content']))
    if 'metadata' in parser_obj:
        metadata_dict = parser_obj['metadata']
        if 'Author' in metadata_dict:
            if type(metadata_dict['Author']) == type([]):
                metadata.append(metadata_dict['Author'][0])
            else:
                metadata.append(metadata_dict['Author'])
        if 'xmp:CreatorTool' in metadata_dict:
            if type(metadata_dict['xmp:CreatorTool']) == type([]):
                metadata.extend(metadata_dict['xmp:CreatorTool'])
            else:
                metadata.append(metadata_dict['xmp:CreatorTool'])
        if 'Content-Type' in metadata_dict:
            if type(metadata_dict['Content-Type']) == type([]):
                metadata.append(metadata_dict['Content-Type'][0])
            else:
                metadata.append(metadata_dict['Content-Type'])
        if 'Company' in metadata_dict:
            if type(metadata_dict['Company']) == type([]):
                metadata.append(metadata_dict['Company'][0])
            else:
                metadata.append(metadata_dict['Company'])
def search_content(file_path, expressions):
    """Open a file and search its contents against a set of RegEx."""
    matches = []
    count = 0
    data = parser.from_file(file_path)
    # Read into an I/O buffer for better readline support
    if not data:
        # There is no content that could be extracted
        return matches
    content = io.StringIO(data['content'])
    # TODO this may create a very large buffer for larger files
    # We may need to convert this to a while readline() loop
    for line in content.readlines():
        count += 1  # count the number of lines
        if line:
            for rex in expressions:
                # Check if the line matches all the expressions
                res = rex.regex.search(line)
                if res:
                    # If there's a match append to the list
                    matches.append(cazobjects.CazRegMatch(res, file_path, count, rex.name))
    return matches
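# Hedged sketch of the input search_content() expects (illustrative only): each entry in
# `expressions` must expose a compiled `.regex` and a `.name`; a namedtuple is used here as a
# minimal stand-in for whatever object the real caller (cazobjects) builds.
import re
from collections import namedtuple

CazExpression = namedtuple('CazExpression', ['name', 'regex'])  # hypothetical stand-in
expressions = [
    CazExpression('email', re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')),
    CazExpression('ipv4', re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')),
]
# matches = search_content('report.pdf', expressions)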
def getTikaTags(filename):
    import tika
    from tika import parser
    import obo
    import tika_obo
    import gethavens

    tikaUrl = getTikaAddress()
    parsed = parser.from_file(filename, tikaUrl)
    metadata = parsed["metadata"]
    content = parsed["content"]
    jsonprops = {'cm:title': str(metadata['resourceName'])}
    for key in metadata:
        newkey = str(key)
        value = str(metadata[key])
        jsonprops[newkey] = value
    title = jsonprops['resourceName']
    namebreak = title.split('.')
    havenrecord = gethavens.getPropertiesHaven(str(jsonprops['resourceName']))
    jsonprops['Description'] = 'Ranked:' + str(havenrecord['rank']) \
        + ' most secretive Tax Haven\nhttps://www.google.co.uk/maps/place/' \
        + havenrecord['country']
    jsonprops['Name'] = havenrecord['country']
    jsonprops['cmis:title'] = str(title)
    jsonprops['cmis:author'] = 'admin'
    return jsonprops
def _request_pdf_data(self, url):
    parsed = parser.from_file(url)
    return {
        'url': url,
        'title': self._parse_pdf_title(parsed),
        'body': self._parse_pdf_body(parsed)
    }
def main(file_name):
    fi = open("sentences.txt", "w+")
    fi_summary = open("summary.txt", "w+")
    fi_cool = open("wtv.txt", "w+")

    score_sentences = SentenceScores()
    parsed = parser.from_file(file_name)
    print parsed["metadata"]
    content = parsed["content"]
    content = content.strip()
    fi_cool.write(content.encode("utf-8"))

    sentences = content.split(". ")
    sentences = map(clean_sentence, sentences)
    lines = score_sentences.get_summary_lines(sentences)
    max_len = len(lines) / 3
    needed_lines = lines[0:max_len]
    sorted_lines = sorted(needed_lines, key=lambda x: x[0])

    for line_num, score in sorted_lines:
        fi_summary.write((str(line_num + 1) + ", " + sentences[line_num]).encode("utf-8"))
    for sentence in sentences:
        fi.write(sentence.encode("utf-8"))

    fi.close()
    fi_summary.close()
def __init__(self, fileName):
    parsed = parser.from_file(fileName)
    metadata = parsed["metadata"]

    # Return re.sub('[\s+]', '', content)
    # TODO: Delete... Very Redundant..
    content = parsed["content"]
    content = content.replace('\n', '')
    content = content.replace('\t', '')
    content = content.replace('\'', '')
    content = content.replace('\"', '')
    rx = re.compile('\W+')
    content = rx.sub(' ', content).strip()
    self.content = content

    # Title...
    try:
        title = metadata['title']
    except:
        title = 'Untitled'
    title = title.replace('\t', '')
    title = title.replace('\t', '')
    title = title.replace('\'', '')
    title = title.replace('\"', '')
    title = rx.sub(' ', title).strip()
    self.title = title

    # self.type = self.metadata['Content-Type-Hint']
    # self.name = self.metadata['resourceName']
    # lanFix = re.sub('[\s+]', '', content)
    self.lang = language.from_file(fileName)
def makeSearchable(self, src, subdir):
    rootDir = subdir + "/examplePDFs"
    pdfPath = rootDir + "/" + "rawPdfs"
    finishedTextPath = rootDir + "/" + "finishedText"
    removed_text_path = rootDir + "/" + "removedText"
    gsPath = rootDir + "/" + "gsPdfs"
    imagesProcessedPath = rootDir + "/" + "imagesProcessed"
    imageText = rootDir + "/" + "imageText"

    if not os.path.exists(pdfPath):
        os.makedirs(pdfPath)
    if not os.path.exists(finishedTextPath):
        os.makedirs(finishedTextPath)
    if not os.path.exists(removed_text_path):
        os.makedirs(removed_text_path)
    if not os.path.exists(gsPath):
        os.makedirs(gsPath)
    if not os.path.exists(imagesProcessedPath):
        os.makedirs(imagesProcessedPath)
    if not os.path.exists(imageText):
        os.makedirs(imageText)

    filename, fileType = src.rsplit(".", 1)

    print("\n**********************")
    print("Processing file: " + filename)
    print("**********************\n")

    # Extract easy text
    print("Getting text that can be easily extracted...")
    rawText = parser.from_file(pdfPath + "/" + src)
    if rawText["content"] is None:
        print("Found no text to extract, continuing process")
    else:
        fileOutput = open(finishedTextPath + "/" + filename + ".txt", 'w')
        fileOutput.write(rawText["content"].encode("utf-8"))
        fileOutput.close()

    # Remove text from pdf
    print("Removing text from pdf")
    process1 = subprocess.Popen(['java', '-jar', 'PdfTextDeleter.jar', src,
                                 os.path.join(removed_text_path, src)])
    process1.wait()

    # Apply ghostscript to removed text pdfs
    if not os.path.exists(gsPath + "/" + filename + "-imgs"):
        os.makedirs(gsPath + "/" + filename + "-imgs")
    if not os.path.exists(rootDir + "/imagesProcessed/" + filename + "-imgs"):
        os.makedirs(rootDir + "/imagesProcessed/" + filename + "-imgs")
    if not os.path.exists(rootDir + "/imageText/" + filename + "-imgs"):
        os.makedirs(rootDir + "/imageText/" + filename + "-imgs")

    print("Converting left over pdf to images")
    process2 = subprocess.Popen(["gs", "-dNOPAUSE",
                                 "-sFONTPATH=/opt/local/share/ghostscript/9.16/Resource/Font/",
                                 "-sDEVICE=pngalpha", "-r300", "-dBATCH",
                                 "-sOutputFile=" + gsPath + "/" + filename + "-imgs" + "/" + filename + "-%03d" ".png",
                                 removed_text_path + "/" + src],
                                env={'PATH': '/opt/local/bin/'})
    process2.wait()

    self.preprocessImages(rootDir, subdir, src)
    self.applyOCRToImages(rootDir, subdir, src)
    self.mergeTextFiles(rootDir, subdir, src)
def parse_file(self, path):
    """
    Parses a file at given path

    :param path: path to file
    :return: parsed content
    """
    parsed = tkparser.from_file(path)
    parsed['file'] = os.path.abspath(path)
    return parsed
def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            row_cosine_distance = [file1, file2]

            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)

            v1 = Vector(file1_parsedData["metadata"])
            v2 = Vector(file2_parsedData["metadata"])
            row_cosine_distance.append(v1.cosTheta(v2))

            a.writerow(row_cosine_distance)
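# Hedged sketch (assumption): the Vector class used above is defined elsewhere in the project;
# a minimal stand-in that treats the Tika metadata dict as a bag of "key: value" terms and
# computes the cosine similarity between two such bags could look like this.
import math
from collections import Counter

class Vector(object):
    def __init__(self, metadata):
        # term frequencies over stringified metadata entries
        self.features = Counter('%s: %s' % (k, v) for k, v in metadata.items())

    def cosTheta(self, other):
        common = set(self.features) & set(other.features)
        dot = sum(self.features[k] * other.features[k] for k in common)
        norm_self = math.sqrt(sum(v * v for v in self.features.values()))
        norm_other = math.sqrt(sum(v * v for v in other.features.values()))
        return dot / (norm_self * norm_other) if norm_self and norm_other else 0.0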
def test_tika_solr():
    s = create_connection(u"Test")
    file_path = u"testdata/example.pdf"
    parsed = parser.from_file(file_path)
    log_parsed(parsed)
    s.add([parsed], commit=True)
    return 1, 0
def convert(filepath, output):
    parsed = parser.from_file(filepath)
    if output:
        basename, ext_pdf = os.path.splitext(os.path.basename(filepath))
        output_path = os.path.join(output, basename + '.json')
    else:
        extensionless_filepath, ext_pdf = os.path.splitext(filepath)
        output_path = extensionless_filepath + '.json'
    with open(output_path, 'wt') as textfile:
        json.dump(parsed, textfile, ensure_ascii=True)
def get_measurements(filename):
    parser_obj = parser.from_file(filename)
    if 'content' in parser_obj and parser_obj['content']:
        return [x for x in regextagger.tag(tokenizer.tokenize(parser_obj['content']))
                if x[1] != 'OTHER']


f_read = open(sys.argv[1], 'r')
given_text = f_read.read()
segmented_lines = nltk.sent_tokenize(given_text)
for text in segmented_lines:
    words = word_tokenize(text)
    sent = t_gram_tag.tag(words)
    print given_text(sent)
def extractText(self, params):
    '''
    Using Tika to extract text from given file and return the text content.
    '''
    file_name = params['file_name']
    parsed = parser.from_file(file_name)
    status = IndexUploadedFilesText(file_name, parsed["content"])
    if status[0]:
        return {'job': 'text_extraction', 'status': 'successful',
                'comment': 'Text extracted and indexed to Solr.'}
    else:
        return {'job': 'text_extraction', 'status': 'unsuccessful', 'comment': status[1]}
def main():
    # read the folder name from argument
    arg_parser = argparse.ArgumentParser(description='Detecting near duplicates using SimHashes')
    arg_parser.add_argument('-f', '--folder', help='Folder with all the images', required=True)
    arg_parser.add_argument('-t', '--tika', help='Path to a running tika server', required=True)
    arg_parser.add_argument('-cb', '--contentbased',
                            help='true/false. Use content in deduplication calculation. Default = false (Must have Tika OCR enabled/ Have Tesseract Installed)',
                            required=False)
    args = arg_parser.parse_args()

    SIM_HASH = defaultdict()

    # read all files
    for root, dirs, files in os.walk(args.folder):
        # grab metadata from each file and write it to an output file
        files = sorted(files)
        for f in files:
            path = root + f
            parsed_data = parser.from_file(path, args.tika)
            if args.contentbased:
                if args.contentbased.lower() == 'true':
                    SIM_HASH[f] = get_simhash(parsed_data, True)
            else:
                SIM_HASH[f] = get_simhash(parsed_data, False)

    # make clusters
    SORTED_HASH = sorted(SIM_HASH.items(), key=operator.itemgetter(1))
    DISTANCES = OrderedDict()
    DISTANCES[SORTED_HASH[0][0]] = None
    for i in range(1, len(SORTED_HASH)):
        DISTANCES[SORTED_HASH[i][0]] = simhash.get_hamming_distance(SORTED_HASH[i - 1][1], SORTED_HASH[i][1])

    # cluster images together
    cluster_number = 0
    CLUSTERS = defaultdict(list)
    for key, value in DISTANCES.iteritems():
        print key + ": " + str(value)
        if value is None:
            CLUSTERS[cluster_number].append(key)
        else:
            if value <= THRESHOLD:
                CLUSTERS[cluster_number].append(key)
            else:
                cluster_number += 1
                CLUSTERS[cluster_number].append(key)

    print '*' * 10 + 'CLUSTERS' + '*' * 10
    for key, value in CLUSTERS.iteritems():
        print 'CLUSTER ' + str(key) + ':'
        for x in value:
            print '\t' + x
    return
def parse_files(file_name):
    print("parsing file : %s \n" % file_name)
    parsed = parser.from_file(file_name)
    print("meta-data:\n")
    print(parsed["metadata"])
    print("content:\n")
    content = parsed["content"]
    c2 = content.encode('utf-8').strip()
    print(c2)
    print("\n\n")
def extract_text(request, file_name):
    '''
    Using Tika to extract text from given file and return the text content.
    '''
    if "none" in IndexStatus("text", file_name):
        parsed = parser.from_file("{0}/{1}/{2}".format(APP_NAME, UPLOADED_FILES_PATH, file_name))
        status = IndexUploadedFilesText(file_name, parsed["content"])
        if status[0]:
            return HttpResponse(status=200, content="Text extracted.")
        else:
            return HttpResponse(status=400, content="Cannot extract text.")
    else:
        return HttpResponse(status=200, content="Loading...")
def scan(filelist, conf=DEFAULTCONF):
    results = []
    for f in filelist:
        metadata = parser.from_file(f).get('metadata', {})
        for field in conf['remove-entry']:
            if field in metadata:
                del metadata[field]
        results.append((f, metadata))

    metadata = {}
    metadata["Name"] = NAME
    metadata["Type"] = TYPE
    return results, metadata
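# Hedged usage sketch (file names and config are illustrative): scan() takes a list of file
# paths plus a config dict whose 'remove-entry' list names the metadata keys to strip before
# the per-file metadata is returned.
if __name__ == '__main__':
    conf = {'remove-entry': ['X-Parsed-By', 'X-TIKA:parse_time_millis']}
    results, module_metadata = scan(['sample1.pdf', 'sample2.docx'], conf=conf)
    for path, meta in results:
        print(path, meta.get('Content-Type'))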
def extractParsingInfo(self):
    FileSizeList = []
    # Getting the files whose size would be computed
    response = MIME_Core().facetQuery('metadata')
    mimeTypeResponse = response.result.dict['facet_counts']['facet_fields']['metadata']

    mimeList = []
    for mime_type, count in mimeTypeResponse.iteritems():
        if mime_type == 'application/java-archive':
            continue
        mimeList.append(mime_type)

    mime_size_diversity = {}
    for mime in mimeList:
        metadata_list = {}
        print mime[mime.index('/') + 1:]
        query = 'metadata:%s' % (mime)
        response = MIME_Core().queryAll(query=query, rows=100)
        files = response.result.dict['response']['docs']
        for file in files:
            parsed = parser.from_file(file['file'][0])
            if 'metadata' in parsed:
                metadata = parsed['metadata']
                for key, value in metadata.iteritems():
                    if key in mime_size_diversity:
                        mime_size_diversity[key] += 1
                    else:
                        mime_size_diversity[key] = 1
            pass
        print 'done with ' + mime

    top_metadata = sorted(mime_size_diversity.items(), key=operator.itemgetter(1), reverse=True)

    metadata = []
    for item in top_metadata[:20]:
        metadata.append(item[0])
        metadata.append(item[1])
        pass

    out_file = open('data/word_cloud/word_cloud.json', "w")
    json.dump(metadata, out_file, indent=4)
    pass
def spell_checker(self, page, stopwordsList):
    driver = self.driver
    driver.implicitly_wait(2)
    driver.get(page)
    self.f.write('--- checking for spelling %s\n' % page)
    allTextOnPage = parser.from_file(page)['content'].encode('utf-8')
    allTextOnPage = re.findall('[a-z]+', allTextOnPage.lower())
    stopwordsList.extend(stopwords.words('english'))
    allTextOnPage = [w for w in allTextOnPage if not w in stopwordsList]
    for word in allTextOnPage:
        if not wordnet.synsets(word):
            print 'Is this correct? ', word
            self.f.write('Is this word correct? %s\n' % word)
        print(check[1])
        msg_check = True
        print('msg_check is True')
    except:
        msg_check = False
        print('msg_check is False')
    return msg_check


# looping through the source folder and opening pdf files
for filename in os.listdir('./source'):
    if filename.endswith(".pdf"):
        print(os.path.join(filename))
        new_filename = (os.path.join(filename))
        # opening the target PDF
        reader = parser.from_file('./source/' + filename)
        pdfText = reader['content']
        cal_item = cal_check(pdfText)
        if cal_item is False:
            # getting the date
            date_target = pdfText.split('Sent: ')
            date_target = date_target[1].split('\n')
            date_target_final = date_target[0]
            date_entry = parse(date_target_final, fuzzy_with_tokens=True)
            # getting the recipients
            to_target = pdfText.split('To: ')
            to_target = to_target[1].split('\n')
            to_entry = to_target[0]
            try:
                cc_target = pdfText.split('Cc: ')
                cc_target = cc_target[1].split('\n')
from tika import parser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser

path = r"C:\Thameem\Testing\xxx\security-guide.pdf"  # raw string so the backslashes are not treated as escapes
parsed_txt = parser.from_file(path)
text = parsed_txt["content"]
print(text)

if path.split(".")[1] == "pdf":
    fp = open(path, 'rb')
    # note: this rebinds the name `parser` (previously tika's parser module) to a pdfminer PDFParser
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    ToC_list = [i[1] for i in doc.get_outlines()]
    print(ToC_list)
    # lst = [i for i in lst if re.search("[A-Za-z]{3,}", i) and re.sub("^[0-9]+", "", i).strip() not in Filtered_ToC_list]
    # print(lst)
    # processed_txt = " ".join([re.sub("'|\"|\\\\", "", x) for x in lst]
    # text = " ".join([s for s in text.split("\n") if s and])
    # print(text)
new_file = list(set(list2).difference(set(list1)))  # if anything changed, new_file is the list of newly added files
list1 = copy.deepcopy(list2)  # refresh list1

for file in new_file:  # handle each newly added file in turn
    print("\nNew file:", file)
    extension_name = os.path.splitext(file)[1]  # judge the file by its extension
    if extension_name != '.pdf':
        print("Not a pdf file!")
        continue
    file_name = os.path.splitext(file)[0]  # get the file name without extension
    print("Processing:", file_name, "please wait...")
    pdf_file = spy_folder + '/' + file  # pdf_file is the full path of the source file
    word_file = config['save_folder'] + '/' + file_name + '.docx'  # word_file is the output .docx path
    text_file = config['save_folder'] + '/' + file_name + '.txt'  # text_file is the output .txt path

    parse_entire_pdf = parser.from_file(pdf_file, xmlContent=True)
    parse_entire_pdf = parse_entire_pdf['content']
    content = html_to_plain_text(parse_entire_pdf)

    pattern = re.compile(".*" + key_word + ".*", re.I)
    result = pattern.findall(content)
    print("Number of matching paragraphs:", len(result))

    result_csv = list()
    result_csv.append("Title: " + file_name)
    # result_csv.append("Author: " + file_name + "\n")
    result_text = "Title: " + file_name
    # result_text += "\nAuthor: " + file_name + "\n"
    cnt = 1
    for para in result:
{q_num} \numeric off : \end_layout'''

get_path = r'C:\Users\roybo\Desktop\University\semester 2'
create_folder_path = r'C:\Users\roybo\OneDrive\University'
dups_in_create = True

subject = None
hw_number = None
number_quest = None
get_lines = 5
assignment_path = None

if subject is None or number_quest is None:
    assignment_path = assignment_path if assignment_path is not None else sys.argv[1]
    assignment_text = parser.from_file(assignment_path)['content'].strip()
    # try:
    #     assignment_text = textract.process(assignment_path).decode('UTF-8')
    # except UnicodeDecodeError as e:
    #     print(parser.from_file(assignment_path))
    if subject is None:
        calculated_subject = get_subject(assignment_text)
        if calculated_subject is None:
            raise Exception("can't detect subject")
        else:
            subject = calculated_subject
    if number_quest is None:
        calculated_questions = count_questions(assignment_text, subject)
        if calculated_questions is None:
            raise Exception("can't detect question number")
    res = pattern.findall(text)
    res = set([''.join(i) for i in res])
    for item in res:
        if item.startswith('Fig'):
            figures[page] += 1
    if figures[page] == 0:
        s = set()
        pattern = re.compile(r'(Figure|Table|Fig\.)\s*(\d+)(\.[A-Z]{0,1})')
        res = pattern.findall(text)
        res = set([''.join(i).replace('.', ':').split(':')[0] for i in res]) - s
        s = s | res
        for item in res:
            if item.startswith('Fig'):
                figures[page] += 1
    return figures, tables, equations


if __name__ == "__main__":
    pdfFileObject = open(pdf, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages

    # page-wise feature
    figures, tables, formulas = parsevec(pdf, pdfReader, count)

    raw = parser.from_file(pdf)
    text = str(raw['content']).split('\n')
    title = word_tokenize(text[0].rstrip('\n'))

    # paper-wise feature
    wordf = getWordsf(title, text[1:])
    references = countReferences(text)
def get_transactions(self, trans_file):
    parsed = parser.from_file(trans_file)
    #print(parsed["content"])
    prev = None
    fund = None
    last_dt_trans = list()
    transactions = dict()
    for l in parsed['content'].splitlines():
        if l.strip() == '':
            continue
        if 'TransactionDate' in l and prev:
            fund = prev.replace('TRANSACTION SUMMARY FOR ', '')
        elif 'Opening Balance' in l:
            continue
        elif 'Closing Balance' in l:
            ldt = get_date_or_none_from_string(l.split(' ')[0], '%d-%b-%Y')
            for t in last_dt_trans:
                t['date'] = ldt
                transactions[fund].append(t)
            last_dt_trans.clear()
            fund = None
        elif fund:
            tran = dict()
            description = ''
            field = 0
            for i, token in enumerate(l.split(' ')):
                if i == 0:
                    dt = get_date_or_none_from_string(token, '%d-%b-%Y')
                    if dt:
                        tran['date'] = dt
                        field = 1
                    else:
                        if description == '':
                            description = token
                        else:
                            description += ' ' + token
                        field = 1
                else:
                    temp = get_float_or_none_from_string(token)
                    if not temp and temp != 0:
                        if description == '':
                            description = token
                        else:
                            description += ' ' + token
                    else:
                        if field == 1:
                            tran['units'] = temp
                            tran['description'] = description
                        elif field == 2:
                            tran['nav'] = temp
                        elif field == 3:
                            tran['trans_amount'] = temp
                        field += 1
            if not fund in transactions:
                transactions[fund] = list()
            if 'date' in tran:
                transactions[fund].append(tran)
            else:
                last_dt_trans.append(tran)
        else:
            print(f'ignore {l}')
        prev = l
    return transactions
def computeScores(inputDir, outCSV, acceptTypes, allKeys):
    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        filename_list = []
        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))

        try:
            filename_list = [filename for filename in filename_list if "metadata" in parser.from_file(filename)]
        except ConnectionError:
            sleep(1)

        if acceptTypes:
            filename_list = [filename for filename in filename_list
                             if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8').decode('utf-8')).split('/')[-1] in acceptTypes]
        else:
            print("Accepting all MIME Types.....")

        files_tuple = itertools.combinations(filename_list, 2)
        for file1, file2 in files_tuple:
            try:
                row_edit_distance = [file1, file2]

                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)

                intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys())
                intersect_features = [feature for feature in intersect_features if feature not in na_metadata]

                file_edit_distance = 0.0
                for feature in intersect_features:
                    file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                    file2_feature_value = stringify(file2_parsedData["metadata"][feature])
                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / \
                            (len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
                    file_edit_distance += feature_distance

                if allKeys:
                    file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]
                    file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]
                    # increment by 1 for each disjunct feature in (A-B) & (B-A),
                    # file1_disjunct_feature_value/file1_disjunct_feature_value = 1
                    file_edit_distance += len(file1_only_features) + len(file2_only_features)
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
                else:
                    file_edit_distance /= float(len(intersect_features))    # average edit distance

                row_edit_distance.append(1 - file_edit_distance)
                a.writerow(row_edit_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
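# Hedged sketch (assumption): stringify() is referenced above but not defined in this snippet.
# The edit-distance code only needs it to flatten a Tika metadata value, which may be a plain
# string or a list of strings, into a single comparable string, e.g.:
def stringify(value):
    if isinstance(value, (list, tuple)):
        return ', '.join(str(v) for v in value)
    return str(value)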
import q
import re
import traceback

if False:
    " JVM and tika "
    import tika
    tika.initVM()
    from tika import parser
    raw = parser.from_file('./data/20190417181644xqqs.pdf', xmlContent=True)
    print(raw["metadata"])
    print(raw['content'])
    q.d()
    open("aaaa.html", "w").write(raw['content'])

if True:
    """
    https://github.com/pikepdf/pikepdf/
    operates on whole PDF pages
    no Chinese support, and no encoding setting could be found
    the documentation is JUST LIKE SHIT
    """
    import pikepdf
    # Elegant, Pythonic API
    # with pikepdf.open('./data/Paperid975.pdf') as pdf:
def analysis():
    directory = os.fsencode(directory_path)
    bow_df = pd.DataFrame(columns=headings_bow)  # Create empty table for bow
    report = docx.Document()  # Create report document
    report.add_heading(f'Analysis {os.path.basename(directory_path)}', 0)  # Add title to report

    for file in os.listdir(directory):
        document_path = os.path.join(directory, file).decode()
        document = parser.from_file(document_path)  # Retrieve text from file
        document = document['content']
        content = sub(r'http\S+', " ", document)  # Delete all links
        content = sub("[^a-zA-Z0-9|^-]", " ", content).lower()  # Delete all punctuation/upper case letters
        content_words = tokenize.word_tokenize(content)  # Split words into list
        language_name = language[classify(content)[0]]  # Detect text language
        content_words_core = " ".join(
            filter(lambda x: x in keywords, content_words)).split()  # Delete all words except for words in keywords
        vector = vectorize(content_words_core)  # Count occurrence of keywords
        filename_first = os.fsdecode(file)[0:3]  # Select first 3 characters of filename
        vector.insert(0, language_name.capitalize())  # Add language to vector-list
        vector.insert(0, filename_first)  # Add first 3 characters of filename to vector-list
        bow = pd.DataFrame(vector).transpose()  # Put vector-list into table and transpose
        bow.columns = headings_bow  # Add headings to table
        bow_df = pd.concat([bow_df, bow])  # Add table to table of all files

    bow_df[keywords] = bow_df[keywords].astype('int64')  # Change datatype in table to integer
    bow_df.loc[:, 'Total'] = bow_df.sum(numeric_only=True, axis=1)  # Add totals column
    bow_df.sort_values(by=['Total'], inplace=True, ascending=False)  # Sort table on descending total column
    table_bow = report.add_table(bow_df.shape[0] + 1, bow_df.shape[1])  # Add template table
    for j in range(bow_df.shape[-1]):
        table_bow.cell(0, j).text = bow_df.columns[j]  # Add headers to table
    for i in range(bow_df.shape[0]):
        for j in range(bow_df.shape[-1]):
            table_bow.cell(i + 1, j).text = str(bow_df.values[i, j])  # Add data to table
    table_bow.style = 'Light Shading'  # Change style of table

    for file in os.listdir(directory):
        document_path = os.path.join(directory, file).decode()
        document = parser.from_file(document_path)  # Retrieve text from file
        document = document['content']
        content = sub(r'http\S+', " ", document)  # Delete all links
        content = sub("[^a-zA-Z|^-]", " ", content).lower()  # Delete all punctuation/upper case letters/numbers
        content_words = [w for w in content.split() if len(w) > 1]  # Delete all words with one letter and split words into list
        language_name = language[classify(content)[0]]  # Detect text language
        content_words_core = [w for w in content_words
                              if w not in stopwords.words(language_name)]  # Delete adverbs
        stemmed_words = [SnowballStemmer(language_name).stem(word)
                         for word in content_words_core]  # Group different forms of a word to a single item
        fdist1 = FreqDist(stemmed_words)  # Count occurrence of words
        top_10_words = pd.DataFrame(fdist1.most_common(10),
                                    columns=['Word', 'Count'])  # Put top 10 words in table
        filename = os.fsdecode(file)  # Retrieve filename
        report.add_heading(filename, level=1)  # Add subtitle per document
        report.add_paragraph(f'Language: {language_name.capitalize()}')  # Add language
        table = report.add_table(top_10_words.shape[0] + 1, top_10_words.shape[1])  # Add template table
        for j in range(top_10_words.shape[-1]):
            table.cell(0, j).text = top_10_words.columns[j]  # Add headers to table
        for i in range(top_10_words.shape[0]):
            for j in range(top_10_words.shape[-1]):
                table.cell(i + 1, j).text = str(top_10_words.values[i, j])  # Add data to table
        table.style = 'Light Shading'  # Change style of table

    report.save(f'{os.environ["USERPROFILE"]}/Desktop/report.docx')  # Save document to desktop
import tika
from tika import parser
from os import listdir
from os.path import join, isfile

# before running the script, Tika must be started:
# java -jar tika-app-1.18.jar -s
tika.initVM()

in_path = 'data/raw'
out_path = 'data/interim'

raw_texts = [f for f in listdir(in_path) if isfile(join(in_path, f))]

for raw_text in raw_texts:
    parsed = parser.from_file(join(in_path, raw_text))
    plain_text = parsed['content'].strip()
    fname = raw_text.split('.')[0] + '.txt'
    with open(join(out_path, fname), 'w') as f:
        f.write(plain_text)
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser
import re
import json

movePdf = []

data = parser.from_file(
    'd:/dokumenter/pokemon/Homebrew_Rules_for_Abilities_and_Moves.pdf',
    xmlContent=True)
xhtml_data = BeautifulSoup(data['content'], "html.parser")
for page, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})):
    if 145 > page >= 50:
        _buffer = StringIO()
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        _buffer.truncate()
        _buffer.close()
        movePdf.append({
            'id': 'page_' + str(page + 1),
            'content': parsed_content['content']
        })

movesDict = {}
translateKeyDict = {
    'Move': 'Name',
    'Class': 'DType',
    'Frequency': 'Freq',
    'Effect': 'Effects',
from tika import parser

raw = parser.from_file(
    'C:\\Users\\InnSight\Documents\\Github\\Gloomhaven-Deck-Builder\\ghclass\\BR\\BR Cards.pdf'
)
content = raw['content'].split('Brute\n')
n = 5
list(filter(None, content[17].split('\n')))
def convert_pdf_to_txt(path):
    raw = parser.from_file(path)
    text = (raw['content'])
    print(text)
def download_doronavirus_Data(self):
    # Start Chrome
    driver = webdriver.Chrome(self.DRIVER_PATH)

    # Define the link names
    link_name1 = u'国内事例における都道府県別の患者報告数'
    link_name2 = u'国内における都道府県別の'

    # Build the list of dates to fetch (yyyy-MM-dd HH:mm:ss.SSSSSSSSS)
    dt_now = datetime.datetime.now()
    # half-width digits (e.g. 令和2年5月1日版)
    date_list = []
    # full-width month, variant 1 (e.g. 令和2年5月1日版)
    date_list_halfwidth1 = []
    # full-width day, variant 2 (e.g. 令和2年5月1日版)
    date_list_halfwidth2 = []
    # full-width month and day (e.g. 令和2年5月1日版)
    date_list_fullwidth = []
    # yyyymmdd-style text file names
    filename_list = []
    i = 0
    while (1):
        dt_now_day = dt_now.day - i
        date_list.insert(
            0, '令和2年' + str(dt_now.month) + '月' + str(dt_now_day) + '日版')
        date_list_halfwidth1.insert(
            0, '令和2年' + str(dt_now.month).translate(
                str.maketrans(
                    {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) +
            '月' + str(dt_now_day) + '日版')
        date_list_halfwidth2.insert(
            0, '令和2年' + str(dt_now.month) + '月' + str(dt_now_day).translate(
                str.maketrans(
                    {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) + '日版')
        date_list_fullwidth.insert(
            0, '令和2年' + str(dt_now.month).translate(
                str.maketrans(
                    {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) +
            '月' + str(dt_now_day).translate(
                str.maketrans(
                    {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) + '日版')
        dt_now_yyyymmdd = dt_now + datetime.timedelta(days=-i)
        filename_list.insert(0, dt_now_yyyymmdd.strftime('%Y%m%d'))
        print(date_list)
        print(date_list_halfwidth1)
        print(date_list_halfwidth2)
        print(date_list_fullwidth)
        print(filename_list)
        if str(i + 1) == str(dt_now.day):
            break
        i += 1

    # counter variable
    i = 0
    # loop over every date to fetch
    for data in date_list:
        # open the URL of the ministry page --- (*2)
        driver.get(
            'https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html'
        )
        # wait until the page has opened --- (*3)
        time.sleep(1)
        try:
            # download the coronavirus status report for the date (half-width digits)
            driver.find_element_by_partial_link_text(data).click()
        except Exception:
            # wait 2 seconds
            time.sleep(2)
            try:
                # retry with the month converted to full-width digits
                driver.find_element_by_partial_link_text(
                    date_list_halfwidth1[i]).click()
            except Exception:
                try:
                    # retry with the day converted to full-width digits
                    driver.find_element_by_partial_link_text(
                        date_list_halfwidth2[i]).click()
                except Exception:
                    try:
                        # retry with month and day converted to full-width digits
                        driver.find_element_by_partial_link_text(
                            date_list_fullwidth[i]).click()
                    except Exception:
                        print(data + ' was not found.')
                        continue
        # wait 2 seconds
        time.sleep(2)
        # navigate to the pdf file
        try:
            driver.find_element_by_partial_link_text(link_name1).click()
        except Exception:
            driver.find_element_by_partial_link_text(link_name2).click()
        # wait 5 seconds
        time.sleep(5)
        # get the pdf url
        cur_url = driver.current_url
        # convert the pdf to text
        file_data = parser.from_file(cur_url)
        text = file_data["content"]
        # print(text)
        text_path = self.FILE_PATH + self.SEPALATE + 'result_' + data + '.txt'
        file = open(text_path, 'w')
        file.write(text)
        file.close()

        # edit the text
        # build the prefecture name data
        DATA = """
        北海道 青森県 岩手県 宮城県 秋田県 山形県 福島県 茨城県 栃木県 群馬県
        埼玉県 千葉県 東京都 神奈川県 新潟県 富山県 石川県 福井県 山梨県 長野県
        岐阜県 静岡県 愛知県 三重県 滋賀県 京都府 大阪府 兵庫県 奈良県 和歌山県
        鳥取県 島根県 岡山県 広島県 山口県 徳島県 香川県 愛媛県 高知県 福岡県
        佐賀県 長崎県 熊本県 大分県 宮崎県 鹿児島県 沖縄県
        """

        # drop blank lines and non-matching lines from the text
        path = text_path
        output = ''
        with open(path) as f:
            for s_line in f:
                pre = re.sub('^([^\s]*).*', '\\1', s_line)
                if pre in DATA and re.sub('\s', '', pre) != '':
                    s_line2 = re.sub(
                        '^([^\s]*)\s([^\s]*)\s([^\s]*)\s([^\s]*).*',
                        '\\1,\\2,\\3,\\4', s_line)
                    str2 = re.sub('.*,(.*),.*,.*', '\\1', s_line2)
                    str2 = re.sub('\r|\n', '', str2)
                    if str2.isdigit():
                        s_line3 = re.sub('(.*),(.*),(.*),.*', '\\1,\\2,\\3', s_line2)
                    else:
                        s_line3 = re.sub('(.*),.*,(.*),(.*)', '\\1,\\2,\\3', s_line2)
                    output = output + s_line3
                else:
                    pre = re.sub('^[^\s]*\s([^\s]*).*', '\\1', s_line)
                    if pre in DATA and re.sub('\s', '', pre) != '':
                        s_line2 = re.sub(
                            '^[^\s]*\s([^\s]*)\s([^\s]*)\s([^\s]*)\s[^\s]*.*',
                            '\\1,\\2,\\3', s_line)
                        output = output + s_line2
        file = open(self.FILE_PATH + self.SEPALATE + filename_list[i] + '.txt',
                    'w', encoding="utf-8")
        file.write(output)
        f.close()
        file.close()
        os.remove(path)
        print(data + ' was found.')
        i += 1
        # wait 2 seconds
        time.sleep(2)

    # close the driver
    driver.close()
    driver.quit()
# Author: Dhivyabharathi Ramasamy
# This script extracts text from pdf using tika. Set path as required.

# Import statements
import os
from glob import glob
from urllib.request import urlopen
from tika import parser
import requests

# Set path
pdf_path = os.getcwd() + '/data/pdf/'
extract_path = os.getcwd() + '/data/extracted_files/'

# Extract pdf to text
files = glob(pdf_path + '*.pdf')
print("files found ", len(files))
count = 0
for file in files:
    filename = file.split(pdf_path)[1]
    try:
        raw = parser.from_file(pdf_path + filename)
        with open(extract_path + filename, 'w+') as f:
            f.write(raw['content'])
        count += 1
    except:
        print("unable to parse ", filename)
print("files extracted ", count)
print('Done')
def get_metadata(self):
    parsed = parser.from_file(self.get_file(self.url))
    return parsed["metadata"]
#!/usr/bin/env python
from tika import parser

parsed = parser.from_file('financials.pdf')

with open('financials_metadata.txt', 'w') as file:
    file.write(str(parsed["metadata"]))

with open('financials_content.txt', 'w') as file:
    file.write(parsed["content"])
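# Hedged variant (assumption: a Tika server is already running on localhost:9998): other
# snippets in this collection pass a server endpoint as the second argument to
# parser.from_file, so the same extraction can target an explicit server instead of the
# one tika-python starts on demand.
from tika import parser

parsed = parser.from_file('financials.pdf', 'http://localhost:9998')
print(parsed['metadata'].get('Content-Type'))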
def get_text(self):
    parsed = parser.from_file(self.get_file(self.url))
    return parsed["content"]
import tika
tika.initVM()
from tika import parser

parsed = parser.from_file('/path/to/file')
#print parsed["metadata"]
print parsed["content"]
def pdftotext_converter(filename):
    raw = parser.from_file(filename)
    return raw['content']  # returns a string containing the plain text of a PDF file
def insert_pdf(args):
    buf = StringIO()
    with redirect_stdout(buf), redirect_stderr(buf):
        pdf_path, engine_string_, engine2_string_ = args
        pdf_path = Path(pdf_path)
        engine_ = create_engine(engine_string_)
        engine2_ = create_engine(engine2_string_)

        def get_number_of_pages():
            with pdf_path.open("rb") as pdf:
                reader = PyPDF2.PdfFileReader(pdf)
                if reader.isEncrypted:
                    reader.decrypt("")
                total_pages = reader.getNumPages()
                return total_pages

        def check_if_file_is_in_db_already():
            with engine_.connect() as conn_:
                stmt = text("SELECT * FROM pcmr.pdfs WHERE pdfName = :pdf_name;")
                result_ = conn_.execute(stmt, {"pdf_name": pdf_path.stem})
                return True if result_.rowcount > 0 else False

        # noinspection SqlResolve
        def get_pdf_metadata():
            stmt = text(
                "SELECT ParentID, DataID, CreateDate FROM Regulatory_Untrusted._RegDocs.DTreeCore "
                "WHERE Name LIKE :file_name;")
            with engine2_.connect() as conn_:
                df = pd.read_sql(stmt, conn_, params={"file_name": pdf_path.stem + "%"})
                return df.to_dict("records")[0]

        try:
            if check_if_file_is_in_db_already():
                return
            metadata = get_pdf_metadata()
            metadata["pdf_name"] = pdf_path.stem
            metadata["pdf_size"] = int(pdf_path.stat().st_size / 1024 / 1024 * 100) / 100
            metadata["total_pages"] = get_number_of_pages()
            metadata["xmlContent"] = parser.from_file(str(pdf_path), xmlContent=True)["content"]
            csv_data = get_additional_data(pdf_path.stem)
            metadata["company"] = csv_data["company"]
            metadata["submitter"] = csv_data["submitter"]
            metadata["application_id"] = csv_data["application_id"]
            with engine_.connect() as conn:
                statement = text(
                    "INSERT INTO pdfs (pdfId, pdfName, pdfSize, filingId, date, totalPages, xmlContent,"
                    "company, submitter, application_id, status) " +
                    "VALUE (:DataID,:pdf_name,:pdf_size,:ParentID,:CreateDate,:total_pages, :xmlContent, "
                    ":company, :submitter, :application_id, '');")
                result = conn.execute(statement, metadata)
                print(f"{pdf_path.stem}: successfully inserted {result.rowcount} rows")
        except Exception as e:
            print(f"{pdf_path.stem}: ERROR! {e}")
            traceback.print_tb(e.__traceback__)
        finally:
            return buf.getvalue()
def pdf_to_text(pdf_file_name):
    content = parser.from_file(pdf_file_name)
    return content['content']
def main(pdfdir, textdir):
    dirlist = [fn for fn in os.listdir(pdfdir) if fn.endswith('.pdf')]
    print 'Extracting text, using Tika, from %d files in %s.' % \
        (len(dirlist), pdfdir)
    print '  Writing output text files to %s.' % textdir

    if not os.path.exists(textdir):
        os.mkdir(textdir)

    widgets = ['Files (of %d): ' % len(dirlist), Percentage(), ' ', Bar('='), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=len(dirlist)).start()

    for (i, fn) in enumerate(dirlist):
        pbar.update(i)
        #if int(fn.split('.')[0]) != 1001:
        #    continue
        #print fn
        parsed = parser.from_file(pdfdir + '/' + fn)
        try:
            if parsed['content'] == None:
                print 'Tika found no content in %s.' % fn
                import pdb
                pdb.set_trace()
                continue
        except:
            print 'Tika could not parse %s.' % fn
            continue

        with io.open(textdir + '/' + fn[0:-4] + '.txt', 'w', encoding='utf8') as outf:
            cleaned = parsed['content']

            # Translate some UTF-8 punctuation to ASCII
            punc = {
                0x2018: 0x27, 0x2019: 0x27,  # single quote
                0x201C: 0x22, 0x201D: 0x22,  # double quote
                0x2010: 0x2d, 0x2011: 0x2d, 0x2012: 0x2d, 0x2013: 0x2d,  # hyphens
                0xF0B0: 0xb0,  # degree
                0xFF0C: 0x2c,  # comma
                0x00A0: 0x20,  # space
                0x2219: 0x2e, 0x2022: 0x2e,  # bullets
            }
            # 0x005E:0x5e, 0x02C6:0x5e, 0x0302:0x5e, 0x2038:0x5e,  # carets
            # 0x00B0:0x6f, 0x02DA:0x6f,  # degree
            # 0x00B9:0x31, 0x00B2:0x32, 0x00B3:0x33,  # exponents
            cleaned = cleaned.translate(punc)

            # Replace newlines that separate words with a space (unless hyphen)
            cleaned = re.sub(r'([^\s-])[\r|\n]+([^\s])', '\\1 \\2', cleaned)
            # Remove hyphenation at the end of lines
            # (this is sometimes bad, as with "Fe-\nrich")
            cleaned = cleaned.replace('-\n', '\n')
            # Remove all newlines
            cleaned = re.sub(r'[\r|\n]+', '', cleaned)
            # Remove xxxx.PDF
            cleaned = re.sub(r'([0-9][0-9][0-9][0-9].PDF)', '', cleaned,
                             flags=re.IGNORECASE)
            # And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
            # with optional parentheses, optional LPI contrib
            cleaned = re.sub(
                r'([0-9][0-9].. Lunar and Planetary Science Conference \(?(19|20)[0-9][0-9]\)?)( \(LPI Contrib. No. [0-9][0-9][0-9][0-9]\))? ?',
                '', cleaned, flags=re.IGNORECASE)
            # And "Lunar and Planetary Science XXXIII (2002)"
            # with Roman numeral and optional year
            cleaned = re.sub(
                r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?',
                '', cleaned, flags=re.IGNORECASE)
            # Remove mailto: links
            cleaned = re.sub(r'mailto:[^\s]+', '', cleaned)

            outf.write(cleaned)
            outf.close()
def pdf_parser(path):
    raw = parser.from_file(path)
    return raw['content']
def extract_information(pdf_path):
    raw_text = parser.from_file(pdf_path)
    raw_list = raw_text['content'].splitlines()
    text = " ".join(raw_list)
    return text
def convert_to_text(file_name):
    logger.info("Converting file to text")
    parsed = parser.from_file(file_name)
    full_text = parsed["content"]
    return full_text
def text_extract(self):
    parsed_file = parser.from_file(self.file_path)
    self.meta_data = parsed_file['metadata']
    parsed_content = parsed_file['content']
    return parsed_content
pdfWriter = PyPDF2.PdfFileWriter()

for files_address in pdf_files:
    pdfFileObj = open(files_address, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    for pageNum in range(0, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        pdfWriter.addPage(pageObj)

pdfOutput = open('merged.pdf', 'wb')
pdfWriter.write(pdfOutput)
pdfOutput.close()

raw = parser.from_file("/Users/docha/PycharmProjects/Tools_for_buh/merged.pdf")
raw = raw['content']

special_char_map = {
    ord('ä'): 'a',
    ord('ü'): 'u',
    ord('ö'): 'o',
    ord('õ'): 'o',
    ord('ž'): 'z',
    ord('š'): 's',
    ord('Ä'): 'A',
    ord('Ü'): 'U',
    ord('Ö'): 'O',
    ord('Õ'): 'O',
    ord('Z'): 'Z',
    ord('Š'): 's'
import re
import pandas as pd
from tika import parser

raw = parser.from_file('KW18Abstracts.pdf')
raw_text = raw['content'][3580:12120360]


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# split by regex for abstract names
s = r'((TH|FR|SA)?-?(OR|PO|PUB)\d{3,4})\s'
split_abstracts = re.split(s, raw_text)

# Above regex splits into 4 sections by matching groups
# poster_id
# day
# poster_type
# poster_content
abstract_list = [sections for sections in chunks(split_abstracts[1:], 4)]
abstract_df = pd.DataFrame(
    abstract_list,
    columns=['poster_id', 'day', 'poster_type', 'poster_content'])


def section_split(text):
import os
from tika import parser

savePath = "txt"
if not os.path.exists(savePath):
    os.mkdir(savePath)

folder_name = "arxiv"
file_list = [name for name in os.listdir(folder_name)]
file_list = [folder_name + '/' + str(name) for name in file_list]

i = 1
for file in file_list:
    raw = parser.from_file(file)
    text = raw['content']
    text = str(text)
    l1 = text.split('\n')
    l2 = []
    for line in l1:
        if len(line) > 10:
            l2.append(line)
    dpath = os.path.join(savePath, str(i) + ".txt")
    with open(dpath, "w", encoding='utf-8') as code:
        for line in l2:
            code.write(str(line))
            code.write('\n')
    i += 1