def parse():
    """Parse the docx file."""
    i = []
    if check_found_file(TMP + '/2.docx'):
        doc = docx2txt.process(TMP + '/2.docx')
    else:
        print('docx file is missing!')
        return i  # avoid a NameError on `doc` when the file is absent
    for line in doc.splitlines():
        if line == '':
            continue
        elif line[:4] == 'Фото':
            photolist = line[5:].split(', ')
            photos = []
            for photo in photolist:
                photos.append(
                    '/files/img/' + str(datetime.date.today()) +
                    '/SMALL' + photo.strip() + '.JPG'
                )
            i.append({'photo': photos, 'size': len(photolist)})
        else:
            i.append({'paragraph': line.rstrip()})
    return i
def execute(info_footnotes):
    info = []
    rd.open_location("/DOCX", True)
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".docx"):
            headings_list, pages = get_headings(filename)
            rd.open_location("/DOCX", False)
            raw_text = docx2txt.process(filename)
            raw_text_lower = raw_text.lower()
            TOC = has_TOC(headings_list, raw_text_lower)
            iainfo = gi.execute(raw_text_lower)
            links = combine(raw_text, filename, info_footnotes)
            if headings_list is None:
                back = get_references2(raw_text_lower)
                me = find_m(raw_text_lower)
                # The "no list" and default cases record the same fields.
                info.append([filename, back[0], me, pages, iainfo,
                             links[0], links[1], links[2], TOC])
            else:
                references = get_references(headings_list, raw_text)
                monitoring_and_evaluation = get_monitoring_and_evaluation(headings_list, raw_text)
                info.append([filename, references, monitoring_and_evaluation, pages, iainfo,
                             links[0], links[1], links[2], TOC])
    return info
def process_narratives():
    narrative_outputs = {}
    narrative_list = os.listdir('narratives')
    for narrative in narrative_list:
        if not narrative == '.DS_Store':  # don't try to read the hidden files
            text = docx2txt.process(os.path.join('narratives', narrative))
            tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
def check_regexs(self, regexs, search_extensions, enable_pdf):
    """Checks the file for matching regular expressions: if a ZIP then each file in the ZIP
    (recursively) or the text in a document"""
    if self.type == 'ZIP':
        try:
            if get_ext(self.path) == '.docx':
                doctext = docx2txt.process(self.path)
                self.check_text_regexs(doctext, regexs, '')
            if zipfile.is_zipfile(self.path):
                zf = zipfile.ZipFile(self.path)
                self.check_zip_regexs(zf, regexs, search_extensions, enable_pdf, '')
            else:
                self.set_error('Invalid ZIP file')
        except IOError:
            self.set_error(sys.exc_info()[1])
        except:
            self.set_error(sys.exc_info()[1])
    elif self.type == 'TEXT':
        try:
            file_text = read_file(self.path, 'rb')
            self.check_text_regexs(file_text, regexs, '')
        except WindowsError:
            self.set_error(sys.exc_info()[1])
        except IOError:
            self.set_error(sys.exc_info()[1])
        except:
            self.set_error(sys.exc_info()[1])
    elif self.type == 'SPECIAL':
        if get_ext(self.path) == '.msg':
            try:
                msg = msmsg.MSMSG(self.path)
                if msg.validMSG:
                    self.check_msg_regexs(msg, regexs, search_extensions, enable_pdf, '')
                else:
                    self.set_error('Invalid MSG file')
                msg.close()
            except IOError:
                self.set_error(sys.exc_info()[1])
            except:
                self.set_error(sys.exc_info()[1])
        if enable_pdf:
            if get_ext(self.path) == '.pdf':
                try:
                    pdf = pdfquery.PDFQuery(self.path)
                    pdf.load()
                    self.check_pdf_regexs(pdf, regexs, '')
                except:
                    self.set_error(sys.exc_info()[1])
        if get_ext(self.path) == '.mdb':
            try:
                self.check_access_regexs(self.path, 'mdb', regexs)
            except:
                self.set_error(sys.exc_info()[1])
    return self.matches
def get_resume_in_text(filename):
    if filename.endswith('docx'):
        text = docx2txt.process(filename)
    elif filename.endswith('pdf'):
        with open(filename, 'r') as f:
            text = slate.PDF(f)
            text = text[0]
    elif filename.endswith('txt'):
        with open(filename, 'r') as f:
            text = f.read()
    return text.lower()
def upload(fileName):
    uploads = os.path.join(app.config['UPLOAD_FOLDER'], fileName)
    content = ''
    headers = {"Content-Disposition": "attachment; filename=%s" % fileName}
    if fileName[-4:] == '.txt' or fileName[-4:] == '.pdf':
        file = open(uploads, 'r+')
        content = file.read()
        file.close()
    elif fileName[-5:] == '.docx':
        content = docx2txt.process(uploads)
    return make_response((content, headers))
def _convert_docx_to_text(self, index, password=None):
    input_docx = self.cvFile
    inputPath = os.getcwd()
    if os.path.exists(input_docx):
        inputPath = os.path.dirname(input_docx)
    input_filename = os.path.basename(input_docx)
    input_parts = input_filename.split(".")
    input_parts.pop()
    text = docx2txt.process(input_docx)
    return text.encode('utf-8')
def parse_docx(filename):
    """ Parses a docx at filename """
    text = docx2txt.process(filename)
    chart_list = text.split("\n")
    chart_list = [i for i in chart_list if i != '' and i != ' ']  # Ignore blank lines
    # Just the chart_list items
    chart_list = chart_list[chart_list.index('1'):chart_list.index('30') + 5]
    chart = []
    for i in range(0, len(chart_list), 5):
        chart.append({"artist": chart_list[i + 1], "title": chart_list[i + 2]})
    return chart
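# A hypothetical usage sketch for parse_docx above, assuming a chart document with
# 30 numbered entries; "chart.docx" is a placeholder filename, not from the source.
if __name__ == '__main__':
    chart = parse_docx("chart.docx")
    for position, entry in enumerate(chart, start=1):
        print("%2d. %s - %s" % (position, entry["artist"], entry["title"]))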
def main():
    token_dict = {}
    filename = "GC.docx"
    text = docx2txt.process(filename)
    lowers = text.lower()
    # no_punctuation = lowers.translate(None, string.punctuation)
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    token_dict["gocardless"] = lowers
    tfs = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()
    for col in tfs.nonzero()[1]:
        print feature_names[col], ' - ', tfs[0, col]
def fileToArray(fileName):
    # identify file type
    split = str.split(fileName, ".")
    fileType = split[len(split) - 1]
    print 'File type is ' + str(fileType) + "."
    if fileType == 'docx':
        text = docx2txt.process(fileName).splitlines()
    else:
        # here: assuming a text file
        text = open(fileName).read().splitlines()
    return text
def generate_json_file(self):
    """Open and read the file, and then extract all the content that matches the expressions"""
    try:
        if self.__filelocation is not None and self.__filelocation != "":
            self.__filedata = docx2txt.process(self.__filelocation).encode("utf-8", "ignore")
        else:
            raise ValueError("File Path was not found.")
        self.__expressions = self.__set_expressions()
        self.__convert_to_json()
    except ValueError as valueerror:
        print "Error in the input parameters: " + valueerror.message
    except Exception as error:
        print "Error reading data from the file: " + error.message
def parse_data(filename):
    txt = docx2txt.process(filename)
    # Split data into smaller pieces: tokens
    # split() separates text into words on whitespace when no argument is given
    tokens = txt.split()
    tokens = find_token("Sample", tokens)
    sample = next(tokens)
    tokens = find_token("FirstAngle", tokens)
    first_angle = float(next(tokens))
    tokens = find_token("ScanRange", tokens)
    scan_range = float(next(tokens))
    tokens = find_token("StepWidth", tokens)
    step_width = float(next(tokens))
    tokens = find_token("ScanData", tokens)
    scan_data = list(map(float, tokens))
    return (sample, first_angle, scan_range, step_width, scan_data)
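# find_token is not shown in the snippet above; this is a minimal sketch of what it
# is assumed to do: return an iterator positioned just after the first token matching
# the given keyword, so that next() yields the value that follows it.
def find_token(keyword, tokens):
    iterator = iter(tokens)
    for token in iterator:
        if token.startswith(keyword):
            return iterator
    raise ValueError("token %r not found" % keyword)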
def handleTextFiles(cls, file_path):
    file_path = file_path.lower()
    filename, file_extension = os.path.splitext(file_path)
    getFreqWords = True
    if file_extension == ".txt":
        with open(file_path) as inp:
            text = inp.read().split()
    elif file_extension in [".docx", ".doc"]:
        text = docx2txt.process(file_path).split()
        text = [word.encode("utf-8") for word in text]
    if len(text) == 0:
        getFreqWords = False
    if getFreqWords:
        return MiscFunctions.getNFrequentWords(text, 3)
    else:
        return []
def convert_docx_to_txt(self, path):
    """
    A very simple conversion function which returns unicode text for parsing.
    path = The path to the file
    """
    # https://github.com/ankushshah89/python-docx2txt
    try:
        text = docx2txt.process(path)
        self.logger.debug("Converted docx to text: " + str(path))
        return unicode(text)
    except Exception as e:
        self.logger.error("Failed to convert DOCX to text: " + str(e))
        return ""
def documentToText(path):
    if path[-4:] == ".doc":
        cmd = ['antiword', path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return removeNonAscii(stdout)
    elif path[-5:] == ".docx":
        return removeNonAscii(doc.process(path))
    elif path[-4:] == ".txt":
        inputFile = open(path)
        text = inputFile.read()
        # Because memory and such
        inputFile.close()
        return removeNonAscii(text)
    elif path[-4:] == ".pdf":
        return removeNonAscii(convert_pdf_to_txt(path))
    elif path[-4:] == ".rtf":
        text = Rtf15Reader.read(open(path))
        return removeNonAscii(PlaintextWriter.write(text).getvalue())
    return "Returned Nothing."
def from_docx_func(url):
    download_url = url
    new_f = urllib.request.urlopen(download_url)
    length_downloadfile = new_f.headers['Content-length']
    y = int(length_downloadfile)
    if y > 50000:
        return "sravan"
    ran_file = urllib.request.URLopener()
    ran_file.retrieve(download_url, "rand.docx")
    text = docx2txt.process("rand.docx")
    value = remove_non_ascii(text)
    value = value.replace('\n', ' ')
    value = re.sub('_+', ' ', value)
    value = value.replace('\t', ' ')
    # value = value.replace('/', ' ')
    value = value.replace('\n', ' ')
    value = re.sub(' +', ' ', value)
    # print("docs--------------------------------")
    # print(value)
    return value
def _convert_docx_to_text(self, password=None):
    print "Decoding docx file"
    input_docx = self.cvFile
    outputPath = self.scratchDir
    inputPath = os.getcwd()
    if os.path.exists(input_docx):
        inputPath = os.path.dirname(input_docx)
    input_filename = os.path.basename(input_docx)
    input_parts = input_filename.split(".")
    input_parts.pop()
    randomStr = int(time.time())
    output_filename = outputPath + os.path.sep + ".".join(input_parts) + r".txt"
    output_filename = output_filename.replace(" ", "_")
    print "writing output to {0}".format(output_filename)
    # self.cvTextFile = output_filename
    text = docx2txt.process(input_docx)
    # print text
    fw = open(output_filename, "w")
    print "test"
    fw.write(text.encode('utf-8'))
    print "written successfully"
    fw.close()
    return 0
def get_file(file_name, query):
    print("inside model:", file_name)
    ext = file_name.split(".")[-1]
    text = ''
    print("Found file with extension " + ext)
    if ext == 'docx':
        text = docx2txt.process(file_name)
    elif ext == 'txt':
        with open(file_name) as f:
            for line in f:
                text = text + line
    elif ext == 'xlsx':
        f = pd.ExcelFile(file_name)
        for names in f.sheet_names:
            sheet = pd.read_excel(f, names, header=None)
            for row in sheet.values:
                for w in row:
                    text = text + str(w)
    elif ext == 'pdf':
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_name, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()
    print(ext, len(text), type(text))
    train_data = pd.read_csv("traindata.csv")
    sentences = re.split('\n', text)
    dataset_sentences = pd.DataFrame(sentences, columns=["sentences"])
    null_sentences = dataset_sentences["sentences"] != ''
    dataset_sentences = dataset_sentences[null_sentences]
    final_sentence = []
    for sent in dataset_sentences["sentences"]:
        final_sentence.append(sent.lstrip('0123456789. '))
    final_df = pd.DataFrame(final_sentence, columns=["final_sentences"])
    final_df["final_sentences"] = final_df["final_sentences"].str.replace('"', '')
    punctuations = list("!:?.;,_%`()")
    for punct in punctuations:
        final_df["final_sentences"] = final_df["final_sentences"].str.replace(punct, '')
    final_df["final_sentences"] = final_df["final_sentences"].str.replace("’s", '')
    punctuations2 = list("\-/")
    for punct2 in punctuations2:
        final_df["final_sentences"] = final_df["final_sentences"].str.replace(punct2, ' ')
    for i in range(2):
        final_df["final_sentences"] = final_df["final_sentences"].str.replace("  ", ' ')
    final_df["final_sentences"] = final_df["final_sentences"].str.lower()
    stop_words = list(stopwords.words('english'))
    stopwords_1 = [
        "would", "able", "due", "one", "need", "co", "so4", "socio", "many",
        "small", "low", "go", "per"
    ]
    stopwords_final = stop_words + stopwords_1
    key_words = []
    for sentence in final_df["final_sentences"]:
        words = word_tokenize(sentence)
        for word in words:
            if word not in stopwords_final:
                key_words.append(word)
    lemmat = WordNetLemmatizer()
    lem_list = [lemmat.lemmatize(word, pos='v') for word in key_words]
    tag = nltk.pos_tag(lem_list)
    exclude_tag = [
        "RBR", "RB", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
        "VBD", "DT", "VBN"
    ]
    tagged_list = []
    [tagged_list.append(x[0]) for x in tag if x[1] not in exclude_tag]
    keywords_d = []
    [keywords_d.append(x) for x in tagged_list if x not in keywords_d]
    keywords_df = pd.DataFrame(keywords_d, columns=['keywords'])
    vector = Word2Vec([keywords_d], min_count=1)
    vector_all = []
    for x in keywords_d:
        vector_all.append(vector[x].tolist())
    X_train = list(train_data["keywords"])
    y_train = list(train_data["prediction_numeric"])
    vector1 = Word2Vec([X_train], min_count=1)
    vector_train1 = []
    for x in X_train:
        vector_train1.append(vector1[x].tolist())
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(vector_train1, y_train)
    keywords_df["prediction"] = knn.predict(vector_all)
    keywords_df["prediction_word"] = np.where(
        keywords_df["prediction"] == 1, "customer",
        np.where(
            keywords_df["prediction"] == 2, "employee",
            np.where(
                keywords_df["prediction"] == 3, "finance",
                np.where(keywords_df["prediction"] == 4, "industry", "management"))))
    final_text = ""
    for sent in final_sentence:
        final_text += sent + " "

    def get_topic(query):
        q_tokens = word_tokenize(query)
        q_tokens_pos = nltk.pos_tag(q_tokens)
        exclude_tag = [
            "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC", "VBD",
            "DT", "VBN", "VBZ", "WP", '.'
        ]
        q_tagged_list = []
        [q_tagged_list.append(x[0]) for x in q_tokens_pos if x[1] not in exclude_tag]
        topic = []
        for query_word in q_tagged_list:
            pred = keywords_df.loc[keywords_df["keywords"] == query_word]
            for i in pred["prediction_word"]:
                if i != 0:
                    if i not in topic:
                        topic.append(i)
        return topic

    def main_query(query):
        actual_query = query
        query = query.replace('?', '')
        new_text = ""
        new_sentences = ""
        new1 = ""
        if ext == "docx":
            passage = docx2txt.process(file_name)
            sentences = re.split('\n', passage)
            new_text = ""
            for i in sentences:
                if i != "":
                    j = i.lstrip('0123456789. ')
                    if len(j) != len(i):
                        if new_text != "":
                            new_text = new_text + " " + j
                        else:
                            new_text = new_text + j
            new1 = new_text
            new_sentences = sent_tokenize(new_text)
            print('inside docx')
        elif ext == 'txt':
            passage = ""
            with open(file_name) as f:
                for line in f:
                    passage = passage + line
            sentences = re.split('\n', passage)
            new_text = ""
            print("Length of sentences generated :", len(sentences))
            for i in sentences:
                if i != "":
                    j = i.lstrip('0123456789. ')
                    if len(j) != len(i):
                        if new_text != "":
                            new_text = new_text + " " + j
                        else:
                            new_text = new_text + j
            new_sentences = sent_tokenize(new_text)
            print('inside txt')
        elif ext == 'pdf':
            text = ""
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(file_name, 'rb') as fh:
                for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                    page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            converter.close()
            fake_file_handle.close()
            passage1 = text
            print("PDF")
            text_split = passage1.split()
            pdf_sent = ""
            for t in text_split:
                t = t.lstrip('0123456789. ')
                if t != "":
                    if pdf_sent == "":
                        pdf_sent = t + " " + pdf_sent
                    else:
                        pdf_sent = pdf_sent + " " + t
            print(pdf_sent)
            new_sentences = sent_tokenize(pdf_sent)
            print("PDF tokenize: ", len(new_sentences))
            new_text = ""
            for sent in new_sentences:
                new_text = sent + new_text
            print('inside pdf')
        elif ext == "xlsx":
            text = ""
            f = pd.ExcelFile(file_name)
            for names in f.sheet_names:
                sheet = pd.read_excel(f, names, header=None)
                for row in sheet.values:
                    for w in row:
                        w = w.lstrip('0123456789. ')
                        if text == "":
                            text = text + str(w)
                        else:
                            text = text + " " + str(w)
            new_text = text
            new_sentences = sent_tokenize(new_text)
            print("xlsx tokenize: ", len(new_sentences))
            print('inside excel')
            new2 = new_text
            print(new1 == new2)
        print(new_text)
        print(len(new_text))
        if query.startswith('is') or query.startswith('does'):
            result = predictor.predict(passage=new_text, question=query)
            answer = result['best_span_str']
            tokenized_doc = []
            for d in final_df["final_sentences"]:
                tokenized_doc.append(word_tokenize(d.lower()))
            tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
            model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1,
                            workers=4, epochs=100)
            model.save("test_doc2vec.model")
            model = Doc2Vec.load("test_doc2vec.model")
            q_tokens = word_tokenize(query)
            q_tokens_pos = nltk.pos_tag(q_tokens)
            exclude_tag = [
                "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
                "VBD", "DT", "VBN", "VBZ"
            ]
            q_tagged_list = []
            [q_tagged_list.append(x[0]) for x in q_tokens_pos if x[1] not in exclude_tag]
            a_tokens = word_tokenize(answer)
            a_tokens_pos = nltk.pos_tag(a_tokens)
            exclude_tag = [
                "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
                "VBD", "DT", "VBN", "VBZ"
            ]
            a_tagged_list = []
            [a_tagged_list.append(x[0]) for x in a_tokens_pos if x[1] not in exclude_tag]
            query_final = ""
            for i in q_tagged_list:
                query_final += i + " "
            answer_final = ""
            for i in a_tagged_list:
                answer_final += i + " "
            vec1 = model.infer_vector(query_final.split())
            vec2 = model.infer_vector(answer_final.split())
            similarity = spatial.distance.cosine(vec1, vec2)
            if ((similarity >= 0.005 and similarity <= 0.006)
                    or (similarity >= 0.012 and similarity <= 0.022)
                    or (similarity >= 0.0561 and similarity <= 0.0568)):
                return "No"
            else:
                return "Yes"
        else:
            if actual_query.endswith("?"):
                actual_query = actual_query
            else:
                actual_query = actual_query + "?"
            result = predictor.predict(passage=new_text, question=actual_query)
            answer = result['best_span_str']
            similarity_value = []
            print(len(new_sentences))
            print('inside what questions : ')
            print(answer)
            for k in new_sentences:
                output_tokenize = word_tokenize(answer)
                k_tokenize = word_tokenize(k)
                sw = stopwords.words('english')
                l1 = []
                l2 = []
                output_set = {w for w in output_tokenize if w not in sw}
                k_set = {w for w in k_tokenize if w not in sw}
                rvector = output_set.union(k_set)
                for w in rvector:
                    if w in output_set:
                        l1.append(1)  # create a vector
                    else:
                        l1.append(0)
                    if w in k_set:
                        l2.append(1)
                    else:
                        l2.append(0)
                c = 0
                for i in range(len(rvector)):
                    c += l1[i] * l2[i]
                cosine = c / float((sum(l1) * sum(l2)) ** 0.5)
                similarity_value.append(cosine)
            print("Result : ")
            print(max(similarity_value))
            print(new_sentences[similarity_value.index(max(similarity_value))])
            answer = new_sentences[similarity_value.index(max(similarity_value))]
            return answer

    def datatype(query):
        Descriptive = ['what', 'which', 'who', 'whom', 'whose', 'why', 'where', 'how']
        Number = ['how much', 'how many', 'how old', 'how far']
        Time = ['when', 'how long']
        Boolean = ['is', 'does']
        secondary_word = [
            'profit', 'sum', 'mean', 'percentage', 'total', 'loss', 'difference',
            'age', 'average', 'maximum', 'minimum'
        ]
        query_words = word_tokenize(query)
        query_first_word = query_words[0]
        query_second_word = query_words[1]
        query_both_words = query_first_word + " " + query_second_word
        i = 0
        for w in query_words[1:]:
            if w in secondary_word:
                i += 1
        if query_first_word == 'what' and i > 0:
            ans_type = 'Numerical'
        elif query_both_words in Number:
            ans_type = 'Numerical'
        elif query_first_word in Time or query_both_words in Time:
            ans_type = 'Date/Time'
        elif query_first_word in Descriptive:
            ans_type = 'Text'
        elif query_first_word in Boolean:
            ans_type = 'Boolean'
        else:
            ans_type = 'Please enter valid question'
        return ans_type

    return main_query(query), get_topic(query), datatype(query)
def process_doxc(self, files):
    self.name = files
    return process(str(files))
docs_arr = documents.find({
    'file_name': {'$exists': True},
    'send_to_server': {'$exists': False},
    'file_extension': '.docx',
    'error': {'$exists': False}
}, no_cursor_timeout=True)
conn = pymysql.connect(host='176.112.205.12', user='******', password='******',
                       db='referats', charset='utf8mb4')
cursor = conn.cursor()
counter = 1
for doc in docs_arr:
    try:
        file_path = FILES_DIR + doc['file_name']
        print(str(counter) + '. Sending... ' + doc['file_name'])
        text = docx2txt.process(file_path)
        sql = 'insert into documents (mongo_doc_id,title,content,file_name) values(%s, %s, %s, %s)'
        result = cursor.execute(sql, [str(doc['_id']), doc['title'], text, doc['file_name']])
        conn.commit()
        documents.update_one({'_id': doc['_id']}, {'$set': {'send_to_server': True}})
        counter += 1
    except zipfile.BadZipFile:
        print('Error BadZipFile')
        documents.update_one({'_id': doc['_id']}, {'$set': {'error': 'zipfile.BadZipFile'}})
        counter += 1
        continue
    pd.read_excel('Eamcet Key-converted.xlsx', sheet_name=1, usecols=[10, 11])
]
questions = {}
for dat in data:
    for index, row in dat.iterrows():
        questions[row[0]] = row[1]
# print(questions)
# document = Document('Eamcet Sravan Response Sheet-converted.docx')
# fullText = []
# for para in document.paragraphs:
#     fullText.append(para.text)
# print(fullText)
ques_text = docx2txt.process("filename.docx")
ques_list = ques_text.split()
# print(ques_list)
q_a = []
for i in range(len(ques_list)):
    if 'Question' in ques_list[i]:
        if ques_list[i + 1] == 'Type':
            continue
        q = int(ques_list[i + 3])
        ans = int(ques_list[i + 7])
        q_a.append([q, ans])
# print(q_a)
score = 0
def read_file(self):
    self.content = docx2txt.process(self.path)
Citation examples used in the sample text (random authors and titles with random dates)
(Sabbagh, 2009)
(Sabbagh, n.d.)
(Sabbagh, 2010a)
(Sabbagh, 2010b)
(Qianyi Gu & Sumner, 2006)
(Despotovic-Zrakic et al., 2012)
(Anonymous, 2010)
(Anonymous, n.d.)
(“Barcelona to Ban Burqa,” 2010)
"""

import docx2txt as docx
import re

# Open the document
text = docx.process("lorem_sample.docx")

# Specify a very large number so that each `replace()` call catches
# all stylized double quotes
num_replaces = 100000000

# Replace stylized double quotes with the default double quotes
# https://www.w3schools.com/charsets/ref_utf_punctuation.asp
text = text.replace('“', '"', num_replaces).replace('”', '"', num_replaces).replace(
    '„', '"', num_replaces).replace('‟', '"', num_replaces)

# Text between double quotes: https://stackoverflow.com/a/378447/9263761
# Pattern to find all types of citations
pattern = r'\(([^"\)]*|\bAnonymous\b|"[^"\)]*")(, )([\d]+|n\.d\.|[\d]+[\w])\)'
"""
\( -> opening parentheses
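# A hypothetical usage of the citation pattern above; the sample string stands in
# for the text extracted from lorem_sample.docx and is not from the source.
sample = 'Earlier studies (Sabbagh, 2009) and follow-ups (Sabbagh, 2010a) disagree (Anonymous, n.d.).'
for match in re.findall(pattern, sample):
    print(match[0].strip(), match[2])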
import docx2txt
import pandas as pd
import docx
from docx import Document
import nltk as nl
import numpy as np

my_text = docx2txt.process("sample1_cv.docx")
document = Document('sample1_cv.docx')
lst = pd.read_csv("repository.csv")
a = my_text.encode("ISO-8859-1", "ignore")
headings = []
tables = document.tables
for table in tables:
    for row in table.rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                headings.append(paragraph.text)
headings = [str(k) for k in headings]
sections = a.split('\n\n')
sec_no = 0
paras = ['']
for sec in sections:
    if sec in headings:
        sec_no += 1
        paras.append('')
        continue
    paras[sec_no] += sec + "\n"

import nltk
tokens = []
token_final = []
parser.add_argument("--label_file_path", type=str, default="",
                    help="Path to the label data file")
args = parser.parse_args()
path = args.folder_path
label_file = args.label_file_path
if not os.path.exists(path):
    print("path does not exist: %s" % path)
if not os.path.exists(label_file):
    print("set a proper label file path, path does not exist: %s" % label_file)
    sys.exit(0)
df = pd.read_csv(label_file)
list_doc = glob.glob(path + "/*docx")
new_label_data = []
for file_path in list_doc:
    File_name = os.path.basename(file_path)[:-9]
    text = docx2txt.process(file_path)
    list_sentence = [x.replace("\n", " ") for x in sent_tokenize(text)]
    row = df.loc[df['File Name'] == File_name]
    label_data = trim_entity_spans(get_all_sentence(list_sentence, row))
    new_label_data += label_data
if not os.path.exists('pickle_files'):
    os.makedirs('pickle_files')
pickle_file = '_'.join(path.strip().split('/')[-1].split()) + '.pickle'
with open('pickle_files/' + pickle_file, 'wb') as f:
    pickle.dump(new_label_data, f)
def read_pc_doc_file(doc_file_name):
    try:
        text = docx2txt.process(doc_file_name)
        print 'Doc file %s found' % (doc_file_name)
    except:
        print 'No Doc file %s found' % (doc_file_name)
        return []
    index = 0
    start_text = 'The said goods may be allowed to be cleared at NIL rate of duty in terms of Notification 52/2003-Cus dated 31.03.2006, under intimation to the undersigned.'
    end_text = 'OFFICE OF THE SUPERINTENDENT OF CENTRAL EXCISE, WAGHOLI'
    address_list = []
    while index < len(text):
        start = text.find(start_text, index) + len(start_text)
        end = text.find(end_text, start)
        if end >= 0:
            address_text = text[start:end]
            address_list.append(text[start:end].strip().replace('\n\n', '\n'))
            index = end
        else:
            address_text = text[start:]
            address_list.append(text[start:].strip().replace('\n\n', '\n'))
            index = len(text)
    index = 0
    start_text = 'Balance in B-17 Bond Rs.'
    end_text = 'The said goods may be allowed to be cleared at NIL rate of duty in terms of Notification 52/2003-Cus dated 31.03.2006, under intimation to the undersigned.'
    balance_list = []
    # print text
    while index < len(text):
        start = text.find(start_text, index) + len(start_text)
        if start < len(start_text):
            break
        end = text.find(end_text, start)
        balance_text = text[start:end].strip()
        balance_list.append(balance_text)
        index = end
    index = 0
    start_text = 'This registration authorizes them to obtain/clear material from'
    end_text = 'for re'
    short_address_list = []
    # print text
    while index < len(text):
        start = text.find(start_text, index) + len(start_text)
        if start < len(start_text):
            break
        end = text.find(end_text, start)
        short_address_text = text[start:end].strip()
        short_address_list.append(short_address_text)
        index = end
    document = docx.Document(doc_file_name)
    table_list = []
    tables = document.tables
    for i in range(1, len(tables), 2):
        table = tables[i]
        table_list.append({})
        table_list[-1]['sr_no'] = get_text_from_cell(table.rows[1].cells[0])
        table_list[-1]['description_of_goods'] = get_text_from_cell(table.rows[1].cells[1])
        table_list[-1]['qty'] = get_text_from_cell(table.rows[1].cells[2])
        table_list[-1]['cif_value'] = get_text_from_cell(table.rows[1].cells[3])
        table_list[-1]['total'] = get_text_from_cell(table.rows[2].cells[3])
        table_list[-1]['address'] = address_list[i/2]
        table_list[-1]['balance'] = balance_list[i/2]
        table_list[-1]['short_address'] = short_address_list[i/2]
    return table_list
import os
import docx2txt
import time

os.chdir('C:\\Users\\vprakas\\Desktop\\python\\kpi')
a = os.listdir('C:\\Users\\vprakas\\Desktop\\python\\kpi')
# fullText = []
for i in a:
    try:
        b = i.split("_")
        cc = b[0]
        print(cc)
        text = docx2txt.process(i)
        b = text.splitlines()
        # print(b)
        for x in range(len(b)):
            if "Maintenance Window" in b[x]:
                changetime = str(b[x + 2:x + 3])
                print(changetime)
            if "Affected Devices" in b[x]:
                for z in b[x:x + 2]:
                    q = z
                servername = q.replace("Affected Devices:", "")
                print(servername)
        print("*" * 100)
    except:
        pass
def wordextract(file):
    text = docx2txt.process(file)
    return text
def plot():
    plt.ylabel('Noun')
    plt.xlabel('Frequency')
    plt.xticks(rotation='vertical')
    plt.title('Noun vs Frequency')
    plt.hist(sorted_list, rwidth=0.85, bins=len(counter))
    plt.show()

Filename = input('Enter File Path:\n')
# If the file is in docx format, it is converted to txt before being processed for noun extraction
if Filename.endswith('.docx'):
    lines = docx2txt.process(Filename)
    noun_extraction()
    getfrequency()
    plot()
else:
    File = open(Filename)
    lines = File.read()
    noun_extraction()
    getfrequency()
    plot()
    wcs = doc.BuiltInDocumentProperties("Number of Words")
    wordcounts.append(str(wcs))
    lastsave = doc.BuiltInDocumentProperties("Creation Date")
    lastsavel.append(str(lastsave))
    doc.Close()
print("Authors found!")
print("Wordcounts done!")
print("Creation dates found!")

# now to convert to plain text
editednobr = []
originalnobr = []
for i in efilenames:
    edited = docx2txt.process(i)
    edited = edited.replace('\r', '').replace('\n', '')
    editednobr.append(edited)
for i in ofilenames:
    original = docx2txt.process(i)
    original = original.replace('\r', '').replace('\n', '')
    originalnobr.append(original)

# split into sections -- ORIGINAL
headingol = []
backdigestol = []
argumentsol = []
notesol = []
for i in originalnobr:
# -*- coding: utf-8 -*-
import requests
import xml.etree.ElementTree as ET
import codecs
import docx2txt
import numpy

a = docx2txt.process("TextInput.docx")
a = a.replace(u'“', '')
a = a.replace(u'”', '')
request = u"""<?xml version="1.0" encoding="utf-8"?>
<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:web="http://webFdgRo.uaic/">
  <soapenv:Header/>
  <soapenv:Body>
    <web:parseText>
      <txt>""" + a + u"""</txt>
    </web:parseText>
  </soapenv:Body>
</soapenv:Envelope>"""
encoded_request = request.encode('utf-8')
headers = {"Host": "www.webFdgRo.uaic",
           "Content-Type": "text/xml; charset=UTF-8",
           "Content-Length": str(len(encoded_request))}
print(row)

import json
with open("file.json") as file:
    content = json.load(file)
content.get("something..")

import xml.etree.ElementTree as ET
content = ET.parse("jile.xml")
root = content.getroot()
for child in root:
    print(child.tag, child.text)

import docx2txt
content = docx2txt.process("file.docx")
print(content)

from PyPDF2 import PdfFileReader
file = open("file.pdf", "rb")
content = PdfFileReader(file)
text = content.getPage(0)
text.extractText()
content.numPages

#%% Site blocking
import time
from datetime import datetime as dt
import docx2txt
import re
import os

base_folder = './docs'
documents = os.listdir(base_folder)
filtered_docs = []
text = ''
ending_with_docs = ['docx', 'doc']
print "Started parsing documents........"
for doc in documents:
    if doc.split(".")[1] == ending_with_docs[0].lower() or doc.split(".")[1] == ending_with_docs[1].lower():
        filtered_docs.append(doc)
        text = text + docx2txt.process('./docs/' + doc)
    else:
        print "Invalid file (not doc/docx): " + doc
print "Finished parsing documents"
regex_email = re.findall(r"[\w\S]+[@]\w+[.]\w{3,}", text)
print 'Emails are:'
print regex_email
def get_info(fileDir, files_array):
    while True:
        try:
            # print(os.path.exists(fileDir, fileDir))
            # files_array = [_ for _ in os.listdir(fileDir) if _.endswith(fileExt)]
            indice = 0
            image_array = []
            table_chunks = []
            table_return = []
            for file in files_array:
                # Scrape the page count of the Word file (x.group()); the data lives in docProps/app.xml
                pagina_xml_code = subprocess.Popen(
                    ["unzip", "-p", fileDir + file, "docProps/app.xml"],
                    stdout=subprocess.PIPE)
                output = pagina_xml_code.communicate()[0]
                # print(output)
                pagina_xml_str = output.decode("utf-8")
                x = re.search('(?<=\<Pages\>)(.*)(?=\<\/Pages\>)', pagina_xml_str)
                # print(file)
                try:
                    pagina_xml = x.group()
                except:
                    print("Oops! XML not found for file: " + file)
                    continue
                # print(pagina_xml)
                # open a connection to the Word file
                # doc = docx.Document(fileDir+'/'+file)
                doc = docx.Document(fileDir + file)
                # creation date
                dt_doc = doc.core_properties.created
                # read each paragraph in the Word file
                paragra = [p.text for p in doc.paragraphs]
                # extract the character counts
                # caracteres = docx2txt.process(fileDir+'/'+file)
                content = docx2txt.process(fileDir + file)
                caracteres = []
                for line in content.splitlines():
                    # this will ignore empty/blank lines
                    if line != '':
                        caracteres.append(len(line))
                # collect the paragraph styles used by the Word document in array_styles
                from docx.enum.style import WD_STYLE_TYPE
                styles = doc.styles
                array_styles = []
                paragraph_styles = [s for s in styles if s.type == WD_STYLE_TYPE.PARAGRAPH]
                for style in paragraph_styles:
                    array_styles.append(style.name)
                    # print(style.name)
                # print(len(array_styles))
                # build an array with the size of every image
                for image in doc.inline_shapes:
                    image_array.append([image.width, image.height])
                    # print(image.width, image.height)
                # build an array with every table
                for table in doc.tables:
                    table_chunks.append(table)
                # build the result row
                list_retur = file[2:8], pagina_xml, sum(caracteres), len(table_chunks), \
                    len(image_array), len(array_styles), dt_doc.date()
                # build the result table
                table_return.append(list_retur)
            # return(files_array[indice][2:8], len(caracteres), len(table_chunks), len(image_array), dt_doc.date())
            return table_return
            # break
        except ValueError:
            print("Oops! error while checking the directory and file extension arguments of this method")
def convertDocx(self, filename):
    rawtext = docx2txt.process(filename)
    f = open(filename[:-5] + '_processed.txt', 'w')
    f.write(rawtext)
    f.close()
regex1 = re.compile(
    r"\b(?:Question 1: What do you think are the current challenges to sustainable development in the (Mekong Lancang|MekongLancang) region\?)(?P<Answer1>.*?)(?:Question 2: What does regional cooperation mean to you\? What are the opportunities for regional cooperation to support sustainable development in the Mekong- Lancang\?)(?P<Answer2>.*?)(?:Question 3: From your experience, are there examples where some or all of the Mekong-Lancang countries have cooperated to yield a clear and positive trans-boundary river management outcome\?)(?P<Answer3>.*?)(?:Question 4: What are the relative advantages\/merits of the different mechanisms for cooperation, and do you see any opportunities for improvements\?)(?P<Answer4>.*?)(?:Question 5: In your opinion, when cooperation occurs between Lancang-Mekong countries, what indicates its success\? How do you know if cooperation is successful\?)(?P<Answer5>.*?)(?:Question 6: From your experience, for what types of Lancang-Mekong problems has cooperation been most effective\?)(?P<Answer6>.*?)(?:Question 7: In your view, which factors prevent cooperation\? And which factors enable it\?)(?P<Answer7>.*?)(?:Question 8: From your experience, when Lancang-Mekong countries cooperate for sustainable development of the basin, who are the most influential actors\?)(?P<Answer8>.*?)(?:Question 9: In your opinion, how can governments balance natural resources sustainability with economic development goals\?)(?P<Answer9>.*)\b"
)
regex2 = re.compile(r"Interview with (?P<Name>.*?,)(?P<Org>.*?,)(?P<Country>.*?,)")
columns = [
    "File Name", "Answer1", "Answer2", "Answer3", "Answer4", "Answer5",
    "Answer6", "Answer7", "Answer8", "Answer9"
]
df = pd.DataFrame(columns=columns)

#%%
file_data = {}
index = 0
for file in onlyfiles:
    my_text = docx2txt.process(file)
    shorted_file_name = re.sub(".docx", "", file)
    text_file_name = shorted_file_name + ".txt"
    print(text_file_name)
    with open(text_file_name, "w", encoding='utf-8') as text_file:
        print(my_text, file=text_file)
    f = open(text_file_name, 'r', encoding='utf-8')
    content = f.readlines()
    new_list = []
    for element in content:
        # strip the newline characters
        element = re.sub(r'(\s+\\n)|(\\n)', '', element.strip())
        new_list.append(element)
    new_string = ([
        element.replace("\\u00a0", " ").encode('ascii', 'ignore').decode()
        for element in new_list if element != ""
def build_index_text_docx(file_name):
    return docx2txt.process(file_name).replace('\n\n', ' ')
def import_convert_preprocess(url, extension):
    global doc_count
    global crawled_web_dir_preprocessed
    global crawled_web_dir
    global crawled_web_dir_conv_need
    global page_doc_map
    url_map_name = url
    if url_map_name not in page_doc_map:
        page_doc_map[url_map_name] = -1
        page_ref_count[url_map_name] = 1
        try:
            doc_count_temp = doc_count + 1
            book_name = ""
            if extension == "pdf":
                book_name = str(doc_count_temp) + ".pdf"
            elif extension == "docx":
                book_name = str(doc_count_temp) + ".docx"
            elif extension == "pptx":
                book_name = str(doc_count_temp) + ".pptx"
            book_path = crawled_web_dir_conv_need + "\\" + book_name
            a = requests.get(url, stream=True)
            with open(book_path, 'wb') as book:
                for block in a.iter_content(512):
                    if not block:
                        break
                    book.write(block)
                book.close()
            file_name = str(doc_count_temp) + ".txt"
            file_path = crawled_web_dir + "\\" + file_name
            is_valid_for_indexing = 555
            if extension == "pdf":
                pdf_to_text(book_path, file_name)
                is_valid_for_indexing = preprocess_one_doc_from_pdf(
                    crawled_web_dir, file_name, crawled_web_dir_preprocessed)
            elif extension == "docx":
                text = docx2txt.process(book_path)
                save_text(text, crawled_web_dir, file_name)
                is_valid_for_indexing = preprocess_one_doc(
                    crawled_web_dir, file_name, crawled_web_dir_preprocessed)
            elif extension == "pptx":
                text = pptx_to_text(book_path)
                save_text(text, crawled_web_dir, file_name)
                is_valid_for_indexing = preprocess_one_doc(
                    crawled_web_dir, file_name, crawled_web_dir_preprocessed)
            if is_valid_for_indexing == 1:
                doc_count = doc_count + 1
                page_doc_map[url_map_name] = doc_count
                doc_page_map[doc_count] = url_map_name
                page_ref_count[url_map_name] = 1
            else:
                delete_file(book_path)
                delete_file(file_path)
                page_doc_map[url_map_name] = -2
        except IOError:
            page_doc_map[url_map_name] = -1
    else:
        page_ref_count[url_map_name] = page_ref_count[url_map_name] + 1
def index(request):
    djtext1 = request.POST['t1']
    djtext2 = request.POST['t2']
    state1 = request.POST.get('x1', 'off')
    state2 = request.POST.get('x2', 'off')
    flag = 0
    m1 = len(djtext1)
    m2 = len(djtext2)
    if m1 == 18 and m2 == 18:
        flag = 1
    if state1 == "on" or state2 == "on":
        if state1 == "on":
            url1 = request.POST.get('u1')
            r1 = requests.get(url1)
            htmlcontent1 = r1.content
            soup1 = BeautifulSoup(htmlcontent1, 'html.parser')
            link = soup1.find('td').get_text()
            # link = link[:4000]
        if state2 == "on":
            url2 = request.POST.get('u2')
            r2 = requests.get(url2)
            htmlcontent2 = r2.content
            soup2 = BeautifulSoup(htmlcontent2, 'html.parser')
            link = soup2.find('article').get_text()
            # link = link[:4000]
        if m1 != 18:
            # common text
            common = ""
            matches = difflib.SequenceMatcher(None, djtext1, link).get_matching_blocks()
            for match in matches:
                # print(t1[match.a:match.a + match.size])
                common += djtext1[match.a:match.a + match.size]
            seq = difflib.SequenceMatcher(None, djtext1, common)
            d = seq.ratio() * 100
            d = round(d, 2)
            x = str(d)
            # if common == djtext1 or common == link:
            #     x = "100"
            report(x, djtext1, link, common)
            params = {'text1': djtext1, 'text2': link, 'res': flag, 'len1': m1,
                      'len2': m2, 'ans': x, 'com': common}
            return render(request, 'result.html', params)
        elif request.method == 'POST':
            bfile1 = request.FILES['f1']
            ext1 = bfile1.name
            if ext1[-1] == 'x':
                z = docx2txt.process(bfile1)
            elif ext1[-1] == 'f':
                x1 = PyPDF2.PdfFileReader(bfile1)
                z = " "
                num = x1.getNumPages()
                for i in range(1, num):
                    z += x1.getPage(i).extractText()
            seq = difflib.SequenceMatcher(None, link, z)
            d = seq.ratio() * 100
            d = round(d, 2)
            x = str(d)
            common = ""
            matches = difflib.SequenceMatcher(None, link, z).get_matching_blocks()
            for match in matches:
                common += link[match.a:match.a + match.size]
            report(x, link, z, common)
            params = {'text1': z, 'text2': link, 'res': flag, 'len1': m1,
                      'len2': m2, 'ans': x, 'com': common}
            return render(request, 'result.html', params)
    elif request.method == 'POST' and flag == 1:
        bfile1 = request.FILES['f1']
        bfile2 = request.FILES['f2']
        ext1 = bfile1.name
        ext2 = bfile2.name
        if ext1[-1] == 'x':
            z = docx2txt.process(bfile1)
        elif ext1[-1] == 'f':
            x1 = PyPDF2.PdfFileReader(bfile1)
            z = " "
            num = x1.getNumPages()
            for i in range(1, num):
                z += x1.getPage(i).extractText()
        if ext2[-1] == 'x':
            y = docx2txt.process(bfile2)
        elif ext2[-1] == 'f':
            x2 = PyPDF2.PdfFileReader(bfile2)
            y = " "
            num = x2.getNumPages()
            for i in range(1, num):
                y += x2.getPage(i).extractText()
        seq = difflib.SequenceMatcher(None, y, z)
        d = seq.ratio() * 100
        d = round(d, 2)
        x = str(d)
        common = ""
        matches = difflib.SequenceMatcher(None, y, z).get_matching_blocks()
        for match in matches:
            common += y[match.a:match.a + match.size]
        if common == y or common == z:
            x = "100"
        report(x, y, z, common)
        params = {'text1': z, 'text2': y, 'res': flag, 'len1': m1,
                  'len2': m2, 'ans': x, 'com': common}
        return render(request, 'result.html', params)
    else:
        common = ""
        matches = difflib.SequenceMatcher(None, djtext1, djtext2).get_matching_blocks()
        for match in matches:
            common += djtext1[match.a:match.a + match.size]
        seq = difflib.SequenceMatcher(None, djtext1, djtext2)
        d = seq.ratio() * 100
        d = round(d, 2)
        x = str(d)
        # if common == djtext1 or common == djtext2:
        #     x = "100"
        report(x, djtext1, djtext2, common)
        params = {'text1': djtext1, 'text2': djtext2, 'res': flag, 'len1': m1,
                  'len2': m2, 'ans': x, 'com': common}
        return render(request, 'result.html', params)
import docx2txt
import os

def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)

basedir = "Relatos Esquizofrenia/"
file = "Relatos Esquizofrenia/Relato A/Control/1.a.docx"
test = docx2txt.process(file)
for subdir, dirs, files in os.walk(basedir):
    for file in files:
        filedir = os.path.join(subdir, file)
        output = docx2txt.process(filedir)
        split = filedir.split("/")[1]
        split = split.split("\\")
        print split
        ensure_dir("txts\\" + split[0] + "\\")
        f = open("txts\\" + split[0] + "\\" + split[1] + split[2].split('.')[0] + ".txt", "w")
        f.write(output.encode("utf8"))
        f.close()
# Takes in the text and runs all the functions required for creating a summary from it.
def run_summarization(text):
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)
    '''
    We already have a sentence tokenizer, so we just need to run the
    sent_tokenize() method to create the array of sentences.
    '''
    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)
    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)
    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)
    # 5 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.5 * threshold)
    return summary

# In main, start running summarization on the document specified.
if __name__ == '__main__':
    document = docx2txt.process("(Edited) Copy of Sources_Sought_Trainer_LFTS.docx")
    result = run_summarization(document)
    print(result)
import os
from gtts import gTTS
import docx2txt

# pip install gTTS
# pip install docx2txt
# to play, put this file in the current dir.

# Word-to-text converter: replace the doc filename
filename = "COVID19anditsimpactonsports"
MY_TEXT = docx2txt.process(filename + ".docx")
with open(filename + ".txt", "w") as text_file:
    print(MY_TEXT, file=text_file)

# You will need a text file named test.txt
FLIST = open(filename + ".txt", "r").read().replace("\n", " ")
print("please wait...processing")
TTS = gTTS(text=str(FLIST), lang='en-us')
# Save to mp3 in current dir.
TTS.save(filename + ".mp3")
# Plays the mp3 using the default app on your system that is linked to mp3s.
print("Process done. The file is now stored in your current dir.")
os.system(filename + ".mp3")
import nltk
import docx2txt
from nltk.corpus import stopwords

filename = "GC.docx"
text = docx2txt.process(filename)

def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

token_list = nltk.word_tokenize(text)
stopwords_removed = [word for word in token_list if word not in stopwords.words('english')]

def filter_duplicates(text_list):
    output_list = []
    for word in text_list:
        if word not in output_list:
            output_list.append(word)
    return output_list

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmed = []
# assumption: `filtered` was undefined in the snippet; take it to be the
# de-duplicated, stop-word-filtered token list
filtered = filter_duplicates(stopwords_removed)
for word in filtered:
    stemmed.append(st.stem(word))
"""
def getDocxContent(filename):
    DocxText = docx2txt.process(filename)
    return DocxText
    folder = int(hierachy[-1])
    # print(files)
    for filename in files:
        # if file_counter > 100:
        #     break
        tu = []
        '''filepath for each file'''
        f = root + "/" + filename
        # print(file)
        '''read file content'''
        text_dataframe = d2t.process(f)
        if len(text_dataframe) > mask:
            text_dataframe = text_dataframe[:mask]
        elif len(text_dataframe) < mask:
            continue
        input_ids = torch.tensor(tokenizer.encode(text_dataframe)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states, mems, hidden_states = outputs
        # print(len(last_hidden_states))
        # print(len(last_hidden_states[0]))
        # print(len(last_hidden_states[0][0]))
sheets = workbook.sheet_names()
required_data = []
for sheet_name in sheets:
    sh = workbook.sheet_by_name(sheet_name)
    for rownum in range(sh.nrows):
        row_values = sh.row_values(rownum)
        required_data.append((row_values[4]))
required_data2 = []
for sheet_name in sheets:
    sh = workbook.sheet_by_name(sheet_name)
    for rownum in range(sh.nrows):
        row_values = sh.row_values(rownum)
        required_data2.append((row_values[5]))
required_data1 = list(filter(None, required_data))
z = os.getcwd()
text = docx2txt.process(z + "\\SampleInputDoc1-FAQs.docx")
blob = TextBlob(text)
tokenizer = BlanklineTokenizer()
z = blob.tokenize(tokenizer)
c = '?'
lst = list()
for i in range(0, len(z)):
    x = z[i].find(c)
    if x != -1:
        lst.append(z[i])

import xlsxwriter
workbook = xlsxwriter.Workbook('SampleOutput.xlsx')
worksheet = workbook.add_worksheet()
def convert_docx_to_txt(self, path):
    # https://github.com/ankushshah89/python-docx2txt
    # Very simple setup of python-docx to text
    text = docx2txt.process(path)
    return unicode(text)
Trying to figure out some things relating to capitalization and the feasibility of using it
to help determine if it will work in our program.

Its usability seems limited insofar as it is too vague on its own. It may work in conjunction
with another form of search, since most capitalized words are the requesting entities, the
headings of a specific section, among other things.
"""

import docx2txt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# The name of the file we wish to process
document1 = docx2txt.process("Copy of Sources Sought Synopsis Manuals 8 Jan 2020.docx")
stopwords = list(STOP_WORDS)
len(stopwords)
# Load the medium spacy package so there are at least vectors for calculating similarity
nlp = spacy.load('en_core_web_md')
docx = nlp(document1)
# An array that will hold the important words, i.e. those capitalized
importantWords = []
for word in docx:
    if word.text not in stopwords:
def docx_2_txt(file):
    print(os.path.abspath(file))
    text = docx2txt.process(file)
    with open('{}.txt'.format(file.split('.')[0]), 'w') as fl:
        fl.write(text)
import docx2txt

my_text = docx2txt.process("/home/narendra/Desktop/leather fact.docx")
print(my_text)
def read_txt(doc):
    read1 = docx2txt.process(doc)
    read = read1.splitlines()  # split the document into a list on newlines
    read = [x for x in read if not x.isdigit()]  # drop purely numeric lines
    read = [x for x in read if x]  # drop empty lines
    return read
def convert_docx_to_txt(path):
    return docx2txt.process(path)
def docx_reader(file):
    text = docx2txt.process(open(file, 'rb'))
    # print(text)
    return text
#!/usr/bin/env python
# coding: utf-8

# In[1]:
import docx2txt
my_text = docx2txt.process("Basic inbuilt commands.docx")
print(my_text)

# In[3]:
# filter()
# What does the filter function do?
# filter(function, iterable)
# function: a function to be run for each item in the iterable
# iterable: the iterable to be filtered
# for example: if you define name = [xvc, ohg, ger, gea, y4eg] then name is an iterable

# In[50]:
x = float("3.5000000")
x
def extract_text_from_docx(docx_path):
    txt = docx2txt.process(docx_path)
    if txt:
        return txt.replace('\t', ' ')
    return None
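# A hypothetical usage of extract_text_from_docx above; "resume.docx" is a
# placeholder path, not taken from the source.
text = extract_text_from_docx("resume.docx")
if text is not None:
    print(text[:200])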
def extract(self, filename, **kwargs):
    return docx2txt.process(filename)