Esempio n. 1
0
def agreement(request):
    """Render the agreement page with the text of both consent documents.

    Each ``.docx`` file is converted to text with ``docxpy.process`` and stored
    in the template context under ``agreement1`` / ``agreement2``.  When a file
    is missing, a fallback message is stored under the same key instead so the
    page still renders.

    Args:
        request: The incoming request, passed straight through to ``render``.

    Returns:
        Whatever ``render`` returns for ``account/agreement.html``.
    """
    # (context key, document path, message shown when the document is missing)
    documents = (
        (
            'agreement1',
            'static/doc/개인정보 수집·이용 동의.docx',
            '개인정보 수집·이용 약관을 불러오는데 실패했습니다.',
        ),
        (
            'agreement2',
            'static/doc/개인정보 처리 동침 동의.docx',
            '개인정보 처리 동침 동의 약관을 불러오는데 실패했습니다.',
        ),
    )

    context = {}
    for key, path, fallback in documents:
        # No file handle is opened here, so there is nothing to close
        # afterwards; only a missing file is treated as recoverable.
        try:
            context[key] = docxpy.process(path)
        except FileNotFoundError:
            context[key] = fallback

    return render(request, 'account/agreement.html', context)
def word_counter(docx_path):
    """Return the word count and whitespace-normalised text of a .docx file.

    Args:
        docx_path: Path of the document handed to ``docxpy.process``.

    Returns:
        A ``(count, text)`` tuple.  ``count`` is the number of
        whitespace-separated tokens after every ``":"`` has been turned into a
        space, ignoring tokens that are a lone ``"."``.  ``text`` is the
        document text with every run of whitespace collapsed to one space.
    """
    # Parse the document once and derive both results from the same string.
    raw_text = docxpy.process(docx_path)

    # Colons are replaced before splitting, so a bare ":" token can never
    # survive; only the lone "." still needs filtering.
    words = [word for word in raw_text.replace(":", " ").split() if word != "."]
    text = " ".join(raw_text.split())

    return len(words), text
Esempio n. 3
0
    def filedialog(self):
        """Let the user pick a document, display its text and index its terms.

        The chosen path is stored in ``self.filename`` and echoed into
        ``self.entry``; the extracted text goes to ``self.text`` and is shown
        in a new ``ScrolledText`` widget.  ``self.token()`` supplies the term
        list, from which ``self.unique_terms``, ``self.dictionary`` and the
        per-term occurrence counts in ``self.postings`` are built.
        """
        self.filename = filedialog.askopenfilename(initialdir="/", title="Select file")
        self.entry.insert(0, self.filename)
        self.text = docxpy.process(self.filename)
        print(self.text)
        self.N = len(self.text)
        print("length=", self.N)

        # Start from an empty index every time a new file is loaded.
        self.dictionary = set()
        self.postings = defaultdict(dict)
        self.document_frequency = defaultdict(dict)

        self.txt = ScrolledText(self.labelFrame, width=100, height=20)
        self.txt.grid(row=7)
        self.txt.insert(const.END, self.text)

        self.terms = self.token()
        print(self.terms)
        self.unique_terms = set(self.terms)
        print(self.unique_terms)

        self.dictionary = self.dictionary.union(self.unique_terms)
        for term in self.unique_terms:
            # Number of times the term occurs in the token list.
            self.postings[term] = self.terms.count(term)
def TextSpeechConverter(fldr_name):
    """Convert every file matching ``fldr_name + "*"`` to a text and an audio file.

    For each match the document text is extracted with ``docxpy.process``,
    written as UTF-8 to ``<output>/text/<name>.txt`` and passed to
    ``Text2Speech``, whose result is saved to ``<output>/audio/<name>.mp3``.
    The elapsed time in minutes since the start of the call is printed after
    each file.

    Args:
        fldr_name: Path prefix; a ``*`` wildcard is appended before globbing.
    """
    start_Time = time.time()

    path_of_output = "C:/Users/user/Documents/"  # root directory for generated files
    text_dir = os.path.join(path_of_output, "text")
    audio_dir = os.path.join(path_of_output, "audio")

    for filename in glob(fldr_name + "*"):
        print(filename)
        text = docxpy.process(filename)

        # basename/splitext cope with either path separator, unlike a manual
        # split on a backslash.
        stem = os.path.splitext(os.path.basename(filename))[0]

        # exist_ok tolerates an already-existing directory without hiding
        # real failures such as a permission error.
        os.makedirs(text_dir, exist_ok=True)
        with open(os.path.join(text_dir, stem + ".txt"),
                  "w",
                  encoding='utf-8') as file:
            file.write(text)

        os.makedirs(audio_dir, exist_ok=True)
        audio_obj = Text2Speech(text)
        audio_obj.save(os.path.join(audio_dir, stem + ".mp3"))

        end_Time = time.time()
        execution_time = (end_Time - start_Time) / 60
        print("Total Execution Time: ", execution_time)
def find_files():
    """Report which .docx files in a directory contain the entered word.

    The search word is read from ``txtFrame3`` and the directory from
    ``txtFrame4``.  Three summary lines (total entries, .docx entries, matching
    .docx entries) followed by the matching file names are inserted into
    ``result_text_word``.
    """
    word = txtFrame3.get()
    path = txtFrame4.get()
    print(path)

    # Every entry name in the chosen directory.
    directory_entries = os.listdir(path)
    cursor = float(result_text_word.index(INSERT))
    result_text_word.insert(
        cursor, "Total Files\t:\t" + str(len(directory_entries)) + "\n\n")
    cursor += 1.0

    docx_names = [name for name in directory_entries if name.endswith('.docx')]
    result_text_word.insert(
        cursor, "Docx Files\t:\t" + str(len(docx_names)) + "\n\n")
    cursor = float(result_text_word.index(INSERT)) + 1.0

    # Keep only the documents whose extracted text contains the word.
    matching_names = [
        name for name in docx_names
        if word in docxpy.process(path + '/' + name)
    ]
    result_text_word.insert(
        cursor, "Matched Files\t:\t" + str(len(matching_names)) + "\n\n")
    cursor = float(result_text_word.index(INSERT)) + 1.0

    for name in matching_names:
        result_text_word.insert(cursor, '\t' + name + '\n')
Esempio n. 6
0
def convertToText(file):
    """Extract the text of a PDF or .docx file.

    Paths ending in ``.pdf`` are read page by page with ``PdfFileReader``;
    every other path is handed to ``docxpy.process``.

    Args:
        file: Path of the document to read.

    Returns:
        The extracted text, or - when anything goes wrong - the exception
        instance that was raised (it is returned, not re-raised).
    """
    try:
        if file.endswith('.pdf'):
            # The context manager closes the handle even when parsing fails.
            with open(file, 'rb') as handle:
                PDF = PdfFileReader(handle, strict=True)
                # Encrypted files are tried with an empty password; a result
                # of 0 means the password was rejected.
                if PDF.isEncrypted and PDF.decrypt('') == 0:
                    raise Exception("Nope")

                # Pages must be read while the file is still open.
                return ''.join(page.extractText() for page in PDF.pages)

        return docxpy.process(file)

    except Exception as e:
        return e
Esempio n. 7
0
def docx(i, o, ip, op):
    """Extract the text of a .docx file, preprocess it and write it out.

    Args:
        i: Input file name, appended to ``ip``.
        o: Output file name, appended to ``op``.
        ip: Input path prefix.
        op: Output path prefix.
    """
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print('docx')
    # Preprocess before opening the output so that a failure here does not
    # leave an empty or truncated output file behind.
    text = char_preprocess(docxpy.process(ip + i))
    with open(op + '%s' % o, 'w') as f:
        f.write(text)
Esempio n. 8
0
def search(dir_path, keyword):
    """Find the .docx files in ``dir_path`` whose text contains ``keyword``.

    The comparison is case-insensitive.

    Args:
        dir_path: Directory to scan (not recursive).
        keyword: Text to look for.

    Returns:
        A tuple ``(all entries, .docx entries, matching .docx entries)``, each
        a list of names as returned by ``os.listdir``.
    """
    directory_entries = os.listdir(dir_path)
    docx_names = []
    hits = []

    for entry in directory_entries:
        if not entry.endswith('.docx'):
            continue
        docx_names.append(entry)
        contents = dx.process(f'{dir_path}/{entry}')
        if keyword.lower() in contents.lower():
            hits.append(entry)

    return directory_entries, docx_names, hits
Esempio n. 9
0
    def getResumeNouns(self):
        """Collect stemmed noun/pronoun tokens from every resume in ``self.resume``.

        Each ``.pdf`` or ``.docx`` entry of the directory is converted to text,
        tokenised with ``nltk.word_tokenize``, stemmed with ``PorterStemmer``
        and POS-tagged; tokens whose tag starts with one of the noun/pronoun
        tags are kept.

        Returns:
            A list with one dict per processed file, holding ``'filename'``
            (the bare entry name), ``'list'`` (the kept tokens) and ``'text'``
            (the extracted text).

        Raises:
            Exception: If a PDF is encrypted and the empty password is rejected.
        """
        # ``match`` anchors at the start of the tag only, so any tag beginning
        # with NN, PRP or WP is accepted.  ``$`` is escaped so it is read as
        # the literal character of the PRP$ / WP$ tags, not as an anchor.
        noun_tag = re.compile(r'NN|NNS|NNP|NNPS|PRP|PRP\$|WP|WP\$')
        stemmer = PorterStemmer()
        resumeNounsList = []

        for file in os.listdir(self.resume):
            if not file.endswith(('.docx', '.pdf')):
                continue

            # os.listdir yields bare names; join them with the directory so
            # the files are found regardless of the current working directory.
            full_path = os.path.join(self.resume, file)

            if file.endswith('.pdf'):
                with open(full_path, 'rb') as handle:
                    PDF = PdfFileReader(handle)
                    if PDF.isEncrypted:
                        decrypt = PDF.decrypt('')
                        if decrypt == 0:
                            print("Password Protected PDF: " + file)
                            raise Exception("Nope")
                        elif decrypt == 1 or decrypt == 2:
                            print("Successfully Decrypted PDF")

                    # Pages must be read while the file is still open.
                    text = ''.join(
                        "\n" + page.extractText() for page in PDF.pages)
            else:
                text = docxpy.process(full_path)

            stemmed = [stemmer.stem(token) for token in nltk.word_tokenize(text)]
            saveNouns = [
                token for token, tag in nltk.pos_tag(stemmed)
                if noun_tag.match(tag)
            ]

            resumeNounsList.append({
                'filename': file,
                'list': saveNouns,
                'text': text
            })

        return resumeNounsList
def con(filenames, code, counter, subdir):
    """Write the text of each document to a numbered ``.txt`` file.

    Output files go to ``input/<subdir>/`` and are named
    ``<code><counter zero-padded to 4 digits>.txt``; the counter is incremented
    before each file is written.

    Args:
        filenames: Iterable of document paths passed to ``docxpy.process``.
        code: Prefix of every output file name (converted with ``str``).
        counter: Number of the last file already written.
        subdir: Sub-directory of ``input`` that receives the files.

    Returns:
        The counter value after the last file, so a later call can continue
        the numbering.
    """
    out_dir = 'input/' + subdir
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    for file in filenames:
        counter += 1
        text = docxpy.process(file)
        out_name = str(code) + str(counter).zfill(4) + ".txt"
        # The context manager closes the file even if the write fails.
        with open(out_dir + '/' + out_name, 'w', encoding="utf-8") as saveFile:
            saveFile.write(text)

    return counter

#con()
Esempio n. 11
0
 def filedialog(self):
     """Ask for a file, extract its text and show it in the window.

     The chosen path is stored in ``self.filename`` and inserted into
     ``self.entry``; the extracted text is stored in ``self.text``, printed,
     shown in a new ``ScrolledText`` widget, inserted into ``self.entry1`` and
     printed once more wrapped by ``textwrap.fill``.
     """
     chosen_path = filedialog.askopenfilename(initialdir="/", title="Select file")
     self.filename = chosen_path
     self.entry.insert(0, chosen_path)

     extracted = docxpy.process(chosen_path)
     self.text = extracted
     print(extracted)

     viewer = ScrolledText(self, width=150, height=10)
     self.txt = viewer
     viewer.grid()
     viewer.insert(const.END, extracted)

     self.entry1.insert(0, extracted)
     print(textwrap.fill(extracted))
Esempio n. 12
0
def docx_extract(file, i):
    """Rename the .docx resumes and dump their text into ``result<i>.txt``.

    Entries of ``Resumes`` that contain ``.docx`` but not ``file`` are renamed
    to ``file<i>_<j>.docx`` (``j`` counting from 0 in sorted order).  Then the
    text of every ``.docx`` entry is written, in sorted order, to
    ``result<i>.txt``.

    Args:
        file: Not used by the body; kept so existing callers keep working.
        i: Index used in the renamed file names and the result file name.
    """
    delete_ppms()

    j = 0
    for name in sorted(os.listdir('Resumes')):
        if '.docx' in name and 'file' not in name:
            os.rename(PATH + name,
                      PATH + 'file' + str(i) + '_' + str(j) + '.docx')
            j += 1

    docx_names = [name for name in os.listdir('Resumes') if '.docx' in name]
    # NOTE(review): the key expects a "-" before the number, while the rename
    # above produces "_"-separated names; confirm which naming scheme the
    # files in ``Resumes`` are meant to follow.
    docx_names.sort(key=lambda x: int(x[x.index('-') + 1:x.index('.')]))

    # Sorting happens before the result file is opened, so a bad name no
    # longer leaves an empty, unclosed result file behind.
    with open(f'result{i}.txt', 'w') as out:
        for name in docx_names:
            out.write(docxpy.process(PATH + name))
Esempio n. 13
0
    def filedialog(self):
        """Ask for a file, display its text, tokenise it and summarise it.

        Stores the chosen path in ``self.filename``, the extracted text in
        ``self.text``, the result of ``self.token()`` in ``self.terms`` and the
        set of distinct terms in ``self.unique_terms``; finally calls
        ``self.summary()``.
        """
        selected = filedialog.askopenfilename(initialdir="/", title="Select file")
        self.filename = selected
        self.entry.insert(0, selected)

        body = docxpy.process(selected)
        self.text = body
        print(body)

        text_box = ScrolledText(self.labelFrame, width=100, height=20)
        self.txt = text_box
        text_box.grid(row=7)
        text_box.insert(const.END, body)

        tokens = self.token()
        self.terms = tokens
        print(tokens)

        distinct = set(tokens)
        self.unique_terms = distinct
        print(distinct)

        self.summary()
Esempio n. 14
0
def printtext(file):
    """Extract a document's text, strip punctuation and save a copy.

    ``.docx`` files are read with ``docxpy.process``, ``.pdf`` files with
    ``extract_text`` and ``.txt`` files directly; any other extension yields a
    fixed placeholder sentence.  Every character that is not an ASCII letter,
    digit or whitespace is then removed and the result is written to
    ``../assignment.txt``.

    Args:
        file: Path of the document to read.

    Returns:
        The cleaned text.
    """
    file_extension = os.path.splitext(file)[1]

    if file_extension == ".docx":
        text = docxpy.process(file)
    elif file_extension == ".pdf":
        text = extract_text(file)
    elif file_extension == ".txt":
        with open(file, "r") as source:
            text = source.read()
    else:
        text = "I don't know this file type"

    # Raw string so that ``\s`` reaches the regex engine unchanged.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)

    # The context manager guarantees the copy is flushed and closed.
    with open("../assignment.txt", "w") as assignmentfile:
        assignmentfile.write(text)
    return text
Esempio n. 15
0
def convert(request, pk):
    """Convert a stored document to .docx, extract its text and translate it.

    The document's file is converted with ``parse`` into a ``.docx`` next to
    it, the text is extracted with ``docxpy.process`` and translated with
    ``Translator.translate(..., dest="hi")``.  Progress and results are only
    printed; the rendered page receives just the document itself.

    Args:
        request: The incoming request, passed through to ``render``.
        pk: Primary key of the ``Document`` to convert (404 if absent).

    Returns:
        Whatever ``render`` returns for ``audio_detail.html``.
    """
    document = get_object_or_404(Document, pk=pk)
    print(document.document)

    # Build both paths once instead of repeating the expression.
    static_root = "D:\\audio\\audio\\static\\"
    source_path = static_root + str(document.document)
    # Dropping the last four characters assumes a three-letter extension
    # such as ".pdf".
    docx_path = static_root + str(document.document)[:-4] + ".docx"

    parse(source_path, docx_path, start=0)
    print("PDF --> DOCX")

    # extract text
    text = docxpy.process(docx_path)
    print("DOCX --> TXT")
    print(text)

    print("Translating...")
    translator = Translator()
    translation = translator.translate(text, dest="hi")
    print("English --> Hindi")
    print(translation.text)

    context = {'document': document}
    return render(request, 'audio_detail.html', context)
Esempio n. 16
0
    def filedialog(self):
        """Ask for a .doc/.docx/.pdf file and display its text.

        Word documents are read with ``docxpy.process``; for PDFs the page
        count is printed and only the first page's text is extracted.  The text
        is stored in ``self.text``, printed, shown in a new ``ScrolledText``
        widget, and ``self.button1()`` is called afterwards.  For any other
        extension (including a cancelled dialog) a message is printed and
        nothing else happens.
        """
        self.filename = filedialog.askopenfilename(initialdir="/", title="Select file")
        self.entry.insert(0, self.filename)

        # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted.
        ext = self.filename.split(".")[-1].lower()
        if ext == 'doc' or ext == 'docx':
            self.text = docxpy.process(self.filename)
        elif ext == 'pdf':
            # The context manager closes the file even if parsing fails.
            with open(self.filename, 'rb') as obj:
                reader = PdfFileReader(obj)
                num = reader.numPages
                print(num)
                page = reader.getPage(0)
                self.text = page.extractText()
        else:
            # Without this guard ``self.text`` would be missing, or still hold
            # the text of a previously opened file.
            print("Unsupported file type:", self.filename)
            return

        print(self.text)
        self.txt = ScrolledText(self.labelFrame, width=50, height=20)
        self.txt.grid(row=1, column=3)
        self.txt.insert(const.END, self.text)
        self.button1()
def getWords(doc):
    """Return the ten most frequent words of a document and their counts.

    ``.txt`` files are read directly; anything else goes through
    ``docxpy.process``.  The text is cleaned with ``stripNonAlphaNum``, reduced
    to lower-case words of 3-30 letters, filtered with ``removeStopwords`` and
    counted with ``wordListToFreqDict``.

    Args:
        doc: Path of the document.

    Returns:
        A ``(words, counts)`` pair of parallel lists ordered from most to
        least frequent, each holding at most ten items.
    """
    import os  # local import: the file's import block is not visible here

    # splitext looks at the real extension, so paths such as "./a.txt" or
    # "report.v2.txt" are classified correctly.
    if os.path.splitext(doc)[1] == '.txt':
        with open(doc) as handle:
            text_string = handle.read()
    else:
        text_string = docxpy.process(doc)

    fullwordlist = stripNonAlphaNum(text_string)
    str1 = ' '.join(fullwordlist)
    match_pattern = re.findall(r'\b[a-z]{3,30}\b', str1)
    wordlist = removeStopwords(match_pattern, stopwords)
    dictionary = wordListToFreqDict(wordlist)

    # Ascending stable sort followed by a reversal keeps the original
    # ordering of words that share the same count.
    ranked = sorted(dictionary.items(), key=lambda item: item[1])
    ranked.reverse()
    top = ranked[:10]

    sorted_list = [word for word, _ in top]
    sorted_num = [count for _, count in top]

    return sorted_list, sorted_num
Esempio n. 18
0
def grab(file_path):
    """Return the text of ``file_path`` after passing it through ``filter``.

    NOTE(review): ``filter`` is called with a single argument, so it is
    presumably a project-level function that shadows the builtin - confirm.
    """
    return filter(docxpy.process(file_path))
Esempio n. 19
0
import docxpy
import os
# Process every entry of the ``docs`` directory in turn.
files = os.listdir('docs')
for f in files:
    print(f)
    # Build the path relative to the current working directory.
    target = "docs/" + str(f)
    # print(type(target))
    # The second argument is presumably the directory that receives the
    # document's images - TODO confirm against the docxpy documentation.
    text = docxpy.process(target, "img")
                   'Competence'] = extract_competencies(text, experience_list)
            df.loc[i, 'competence score'] = extract_competencies_score(
                text, experience_list)
            df.loc[i, 'DOB'] = extract_dob(text)

        else:
            df.loc[i, 'Total Experience(in months)'] = 'NA'
            df.loc[i, 'Last Position'] = 'NA'
            df.loc[i, 'Competence'] = 'NA'
            df.loc[i, 'competence score'] = 'NA'
            df.loc[i, 'DOB'] = extract_dob(text)
        i += 1

    else:

        text = docxpy.process(filename)
        # print(text)

        df.loc[i, 'Mobile No.'] = extract_mobile_number(text)
        df.loc[i, 'Email'] = extract_email(text)
        df.loc[i, 'Name'] = extract_name(text)
        df.loc[i, 'Education Qualifications'] = extract_education(text)
        df.loc[i, 'Skills'] = extract_skills(text)
        experience_list1 = extract_entity_sections_grad(text)

        if 'experience' in experience_list1:

            experience_list = experience_list1['experience']
            df.loc[i, 'Total Experience(in months)'] = get_total_experience(
                experience_list)
            df.loc[i, 'Last Position'] = extract_experience(text)
Esempio n. 21
0
def main_docx(file):
    """Return the whitespace-separated tokens of a document's text.

    The result of ``docxpy.process(file)`` is converted with ``str`` and split
    on runs of whitespace.
    """
    return str(docxpy.process(file)).split()
Esempio n. 22
0
Author: Milind Kumar V
"""

import re
import numpy as np
import docxpy

################### defining variables ###################

# Input document and the text file that receives the extracted lines.
path_ts = "./38331-f80.docx"
path_out_text = "./definitions.txt"

################### process the file ###################

# Extract the whole document as one string.
contents = docxpy.process(path_ts)

# Split into lines so that the marker lines can be matched one at a time.
contents = contents.strip().split("\n")

# A line equal to start_tag switches copying on; stop_tag switches it off.
start_tag = "-- ASN1START"
stop_tag = "-- ASN1STOP"

with open(path_out_text, "w") as file:
    copy = False
    for line_idx in range(len(contents)):
        line = contents[line_idx]
        if line.strip() == start_tag:
            copy = True
            continue
        elif line.strip() == stop_tag:
            copy = False
Esempio n. 23
0
def resume_to_str(filename, path="saved-resumes/"):
    """Return the text of a stored resume.

    The extension is everything from the last ``.`` of ``filename``.  PDF files
    (case-insensitive) are delegated to ``pdf_to_str``; anything else is read
    with ``docxpy.process``.

    Args:
        filename: File name including its extension.
        path: Directory prefix, concatenated directly with ``filename``.

    Raises:
        ValueError: If ``filename`` contains no ``.``.
    """
    extension = filename[filename.rindex("."):]
    if extension.lower() != ".pdf":
        return docxpy.process(path + filename)
    return pdf_to_str(filename, path)
def docx_to_text(file_path):
    """Return whatever ``docxpy.process`` extracts from ``file_path``."""
    return docxpy.process(file_path)
Esempio n. 25
0
import docxpy

# Document to read (hard-coded absolute path).
file = 'C:/Users/user/Documents/malayalam.docx'

# Extract the text of the document.
text = docxpy.process(file)

# Alternative call: extract the text and also write the images to /tmp/img_dir.
#text = docxpy.process(file, "/tmp/img_dir")
print(text)
import docxpy
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
#File Location
# File location.  A raw string keeps the Windows backslashes literal instead
# of relying on them not forming valid escape sequences.
f = r'D:\Downloads\Basic-Resume-Template_Word\Word\Basic Resume Template.docx'
text = docxpy.process(f)

print("---------------------")

# Skill keywords to look for in the resume (all lower case).
skills=['python','php','java','javascript','perl','android','c++','c','wordpress','photoshop','graphic design','cloud','asp.net','vb.net','.net','mysql','ajax','microcontroller','data structure','data mining','data warehouse','data plane','sql','selenium','qtp','web developer','bigdata','big data','ios developer','swift','flutter','hadoop','csharp','c sharp','c#','css','html','matlab','angular js','angular','nodejs','node js','node.js','angular.js','reactjs','react js','react.js','vuejs','vue js','vue.js','ui developer','jsp','jquery','rdbms','xhtml','dhtml','appdjango','vbscript','embedded c','app developer','wireless','iot','spring','pascal','javabeans','java beans','linux','azure','firebase','aws','xml','fortran','data analysis','solidity','amazon web service','redhat','alexa','cyber security','cordova','automation','drupal','machine learning','artificial intelligence','blockchain','hyperledger','ethereum','data migration','json','swing','backbone','ado','lamp','arduino','image processing','ccna','flask','mongodb','ruby','hibernate','oracle','ibm db2','teradata','ms access','joomla','indesign','acrobat','corel draw','docker','kubernetes','cobol','robotics','cyber security','augmented reality','virtual reality','autocad','solidworks','illustrator','ui/ux','ux/ui']

# Flatten the document to one lower-case line so that the lower-case
# keywords can be matched.
text2 = text.replace("\n", " ").lower()

# dict.fromkeys drops the repeated keyword while keeping the list order, so
# no skill is reported twice.
# NOTE(review): this is a plain substring test, so short keywords such as
# 'c' or 'ado' also match inside longer words.
list1 = [skill for skill in dict.fromkeys(skills) if skill in text2]

# Comma-separated summary of the matched skills.
s = ','.join(list1)

#print("String: "+s)
Esempio n. 27
0
#%%
import docxpy
import docx2txt
import unicodedata

# %%
# Extract the text of the first article in a single call.
text = docxpy.process('data/article one multiple topics.docx')

# %%
# Same library through its reader object; nothing is processed yet.
doc = docxpy.DOCReader('data/articletwo.docx')

# %%
# Run the reader; this overwrites ``text`` from the cell above.
text = doc.process()

# %%
import docx2txt


# %%
# Extract the second article again with docx2txt; ``text`` is overwritten
# once more and this is the value the cells below use.
text = docx2txt.process('data/articletwo.docx')

# %%
from textsummarization import *
# %%
# Run each listed method over the text.
# NOTE(review): ``extract_sum`` is presumably provided by the star import
# above; the meaning of the 0.5 argument is not visible here.
methods = ['bert_sum']
for i in methods:
    print(i)
    print(extract_sum(text, 0.5, i))
# %%
import torch
import json 
Esempio n. 28
0
#         pa ="data/"+os.path.splitext(file)[0]+".txt"
#         fw = open(pa,"w")
#         fw.writelines(da)
#         fw.close()
# break
# from dateutil.parser import parse
# print(parse('Jun 06'))

# import pandas as pd
# import numpy as np
# df = pd.Series([1,2,np.nan])
# print(df.head(1))
# print(df.index)
# print(np.random.randn(6,4))
# D12316
# import docx
#
# def getText(filename):
#     doc = docx.Document(filename)
#     fullText = []
#     for para in doc.paragraphs:
#         print(para.text)
#         fullText.append(para.text)
#     return '\n'.join(fullText)
# Sample document, addressed relative to the current working directory.
path = "doc_file/Resume_Multiple_Email_Mobile_Rows.docx"
# print(getText(path))

import docxpy
# Extract the document text with docxpy and print it.
text = docxpy.process(path)
print(text)
def get_text_from_docx(docx_path):
    """Return the document's text with all whitespace runs collapsed.

    The text from ``docxpy.process`` is split on whitespace and re-joined with
    single spaces, which also removes leading and trailing whitespace.
    """
    tokens = docxpy.process(docx_path).split()
    return " ".join(tokens)