Example #1
def parse():
    '''
    Parse the docx file
    '''
    i = []
    if check_found_file(TMP + '/2.docx'):
        doc = docx2txt.process(TMP + '/2.docx')
    else:
        print('The docx file is missing!!!')
        return i  # avoid a NameError on `doc` below when the file is absent
    for line in doc.splitlines():
        if line == '':
            continue
        elif line[:4] == 'Фото':
            photolist = line[5:].split(', ')
            photos = []
            for photo in photolist:
                photos.append(
                    '/files/img/' +
                    str(datetime.date.today()) +
                    '/SMALL' + photo.strip() + '.JPG'
                )
            i.append({'photo': photos, 'size': len(photolist)})
        else:
            i.append({'paragraph': line.rstrip()})
    return i
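
check_found_file is not defined in this snippet; a minimal sketch of what it might do (an assumption, not the original helper):

import os

def check_found_file(path):
    # Return True when the expected .docx file exists on disk.
    return os.path.isfile(path)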
Example #2
def execute(info_footnotes):
    info = []
    rd.open_location("/DOCX", True)
    for filename in os.listdir(os.getcwd()):

        if filename.endswith(".docx"):
            headings_list, pages = get_headings(filename)
            rd.open_location("/DOCX", False)
            raw_text = docx2txt.process(filename)
            raw_text_lower = raw_text.lower()
            TOC = has_TOC(headings_list, raw_text_lower)
            iainfo = gi.execute(raw_text_lower)
            links = combine(raw_text, filename, info_footnotes)

            if headings_list is None:
                back = get_references2(raw_text_lower)
                me = find_m(raw_text_lower)
                # The original "no list" check appended the same row in both branches.
                info.append([filename, back[0], me, pages, iainfo, links[0], links[1], links[2], TOC])
            else:
                references = get_references(headings_list, raw_text)
                monitoring_and_evaluation = get_monitoring_and_evaluation(headings_list, raw_text)
                info.append([filename, references, monitoring_and_evaluation, pages, iainfo, links[0], links[1], links[2], TOC])

    return info
Example #3
def process_narratives():
    narrative_outputs = {}
    narrative_list = os.listdir('narratives')
    for narrative in narrative_list:
        if narrative != '.DS_Store':
            # skip the hidden file rather than trying to read it
            text = docx2txt.process(os.path.join('narratives', narrative))
            tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
Example #4
    def check_regexs(self, regexs, search_extensions, enable_pdf):
        """Checks the file for matching regular expressions: if a ZIP then each file in the ZIP (recursively) or the text in a document"""

        if self.type == 'ZIP':
            try:
                if get_ext(self.path) == '.docx':
                    doctext = docx2txt.process(self.path)
                    self.check_text_regexs(doctext, regexs, '')
                    
                if zipfile.is_zipfile(self.path):
                    zf = zipfile.ZipFile(self.path)
                    self.check_zip_regexs(zf, regexs, search_extensions, enable_pdf, '')                                             
                else:
                    self.set_error('Invalid ZIP file')
            except IOError:
                self.set_error(sys.exc_info()[1])
            except:
                self.set_error(sys.exc_info()[1])

        elif self.type == 'TEXT':
            try:
                file_text = read_file(self.path, 'rb')
                self.check_text_regexs(file_text, regexs, '')
            except WindowsError:
                self.set_error(sys.exc_info()[1])
            except IOError:
                self.set_error(sys.exc_info()[1])
            except:
                self.set_error(sys.exc_info()[1])

        elif self.type == 'SPECIAL':
            if get_ext(self.path) == '.msg':
                try:
                    msg = msmsg.MSMSG(self.path)
                    if msg.validMSG:
                        self.check_msg_regexs(msg, regexs, search_extensions, enable_pdf, '')
                    else:
                        self.set_error('Invalid MSG file')
                    msg.close()
                except IOError:
                    self.set_error(sys.exc_info()[1])
                except:
                    self.set_error(sys.exc_info()[1])
            if enable_pdf:
                if get_ext(self.path) == '.pdf':
                    try:
                        pdf = pdfquery.PDFQuery(self.path)
                        pdf.load()
                        self.check_pdf_regexs(pdf, regexs, '')
                    except:
                        self.set_error(sys.exc_info()[1])
            if get_ext(self.path) == '.mdb':
                try:
                    self.check_access_regexs(self.path, 'mdb', regexs)
                except:
                    self.set_error(sys.exc_info()[1])
                
        return self.matches
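
get_ext is used above but not defined in this snippet; a plausible minimal implementation (an assumption):

import os

def get_ext(path):
    # Return the lower-cased file extension, including the leading dot.
    return os.path.splitext(path)[1].lower()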
Example #5
def get_resume_in_text(filename):
    if filename.endswith('docx'):
        text = docx2txt.process(filename)
    elif filename.endswith('pdf'):
        with open(filename, 'rb') as f:  # slate needs a binary file object
            text = slate.PDF(f)
            text = text[0]
    elif filename.endswith('txt'):
        with open(filename, 'r') as f:
            text = f.read()
    return text.lower()
Example #6
def upload(fileName):
    uploads = os.path.join(app.config['UPLOAD_FOLDER'], fileName)
    content = ''
    headers = {"Content-Disposition": "attachment; filename=%s" % fileName}
    if fileName[-4:] == '.txt' or fileName[-4:] == '.pdf':
        # read in binary mode: write access ('r+') is not needed and text mode
        # can fail on raw PDF bytes
        with open(uploads, 'rb') as f:
            content = f.read()
    elif fileName[-5:] == '.docx':
        content = docx2txt.process(uploads)
    return make_response((content, headers))
Example #7
    def _convert_docx_to_text(self, index, password=None):

        input_docx = self.cvFile

        inputPath = os.getcwd()
        if os.path.exists(input_docx):
            inputPath = os.path.dirname(input_docx)
        input_filename = os.path.basename(input_docx)
        input_parts = input_filename.split(".")
        input_parts.pop()
        text = docx2txt.process(input_docx)

        return text.encode('utf-8')
Example #8
def parse_docx(filename):
    """
    Parses a docx at filename
    """
    text = docx2txt.process(filename)
    chart_list = text.split("\n")
    chart_list = [i for i in chart_list if i != '' and i != ' '] # Ignore blank lines
    # Keep only the chart entries: from the line '1' through the last field of entry '30' (each entry spans 5 lines)
    chart_list = chart_list[chart_list.index('1'):chart_list.index('30') + 5]
    chart = []
    for i in range(0, len(chart_list), 5):
        chart.append({"artist": chart_list[i + 1], "title": chart_list[i + 2]})
    return chart
Example #9
def main():
    token_dict = {}
    filename= "GC.docx"
    text = docx2txt.process(filename)
    lowers = text.lower()
    #no_punctuation = lowers.translate(None, string.punctuation)
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

    token_dict["gocardless"] = lowers
    tfs = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()
    for col in tfs.nonzero()[1]:
        print feature_names[col], ' - ', tfs[0, col]
Example #10
def fileToArray(fileName):

    # identify file type
    split    = str.split(fileName,".")
    fileType = split[len(split)-1]
    print 'File type is ' + str(fileType) + "."

    if fileType == 'docx':
        text = docx2txt.process(fileName).splitlines()

    else:
        # here: assuming a text file
        text = open(fileName).read().splitlines()

    return text
Example #11
    def generate_json_file(self):
        """Open and read the file, and then extract all the content that matchs the expressions"""
        try:
            if self.__filelocation is not None and self.__filelocation != "":
                self.__filedata = docx2txt.process(self.__filelocation).encode("utf-8", "ignore")
            else:
                raise ValueError("File Path was not found.")

            self.__expressions = self.__set_expressions()
            self.__convert_to_json()

        except ValueError as valueerror:
            print "Input parameter error: " + valueerror.message
        except Exception as error:
            print "Error reading the file data: " + error.message
Example #12
def parse_data(filename):
    txt = docx2txt.process(filename)
    # Split the data into smaller pieces: tokens.
    # With no argument, split() separates the text on whitespace.
    tokens = txt.split()
    tokens = find_token("Sample", tokens)
    sample = next(tokens)
    tokens = find_token("FirstAngle", tokens)
    first_angle = float(next(tokens))
    tokens = find_token("ScanRange", tokens)
    scan_range = float(next(tokens))
    tokens = find_token("StepWidth", tokens)
    step_width = float(next(tokens))
    tokens = find_token("ScanData", tokens)
    scan_data = list(map(float, tokens))
    return (sample, first_angle, scan_range, step_width, scan_data)
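
find_token is not shown in this example; a minimal sketch consistent with how it is called above, returning an iterator positioned just after the first occurrence of the keyword (the body is an assumption):

def find_token(keyword, tokens):
    # Advance an iterator past `keyword` so that next() yields the value that
    # immediately follows it; the remaining tokens stay available for the
    # later find_token()/list() calls above.
    it = iter(tokens)
    for token in it:
        if token == keyword:
            break
    return it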
Example #13
    def handleTextFiles(cls, file_path):

        file_path = file_path.lower()
        filename, file_extension = os.path.splitext(file_path)
        getFreqWords = True
        if file_extension == ".txt":
            with open(file_path) as inp:
                text = inp.read().split()
        elif file_extension in [".docx", ".doc"]:
            text = docx2txt.process(file_path).split()
            text = [word.encode("utf-8") for word in text]
            if len(text) == 0:
                getFreqWords = False
        else:
            # unsupported extension: avoid a NameError on `text` below
            getFreqWords = False
        if getFreqWords:
            return MiscFunctions.getNFrequentWords(text, 3)
        else:
            return []
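
MiscFunctions.getNFrequentWords is not defined here; one possible minimal implementation using collections.Counter (the class and method names come from the call site above; the body and return format are assumptions):

from collections import Counter


class MiscFunctions:
    @staticmethod
    def getNFrequentWords(words, n):
        # Return the n most common words as (word, count) pairs.
        return Counter(words).most_common(n)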
Example #14
    def convert_docx_to_txt(self, path):
        """
        A very simple conversion function
        which returns unicode text for
        parsing.

        path = The path to the file
        """
        # https://github.com/ankushshah89/python-docx2txt
        try:
            text = docx2txt.process(path)
            self.logger.debug("Converted docx to text: " + str(path))
            return unicode(text)
        except Exception as e:
            # log before returning; the original logged after `return`, so it never ran
            self.logger.error(
                "Failed to convert DOCX to text: " + str(e))
            return ""
Example #15
def documentToText(path):
    if path[-4:] == ".doc":
        cmd = ['antiword', path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return removeNonAscii(stdout)
    elif path[-5:] == ".docx":
        return removeNonAscii(doc.process(path))
    elif path[-4:] == ".txt":
        inputFile = open(path)
        text = inputFile.read() #Because memory and such
        inputFile.close()
        return(removeNonAscii(text))
    elif path[-4:] == ".pdf":
        return removeNonAscii(convert_pdf_to_txt(path))
    elif path[-4:] == ".rtf":
        text = Rtf15Reader.read(open(path))
        return removeNonAscii(PlaintextWriter.write(text).getvalue())
    return "Returned Nothing."
def from_docx_func(url):
	download_url = url
	new_f = urllib.request.urlopen(download_url)
	length_downloadfile = new_f.headers['Content-length']
	y = int(length_downloadfile)
	if y > 50000:
		return "sravan"
	ran_file = urllib.request.URLopener()
	ran_file.retrieve(download_url,"rand.docx")
	text = docx2txt.process("rand.docx")
	value = remove_non_ascii(text)
	value = value.replace('\n',' ')
	value = re.sub('_+',' ',value)
	value = value.replace('\t',' ')
	#value = value.replace('/',' ')
	value = value.replace('\n',' ')
	value = re.sub(' +',' ',value)
	#print("docs--------------------------------")
	#print(value)
	return value
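
remove_non_ascii is not included in this snippet; a minimal sketch of one possible implementation (an assumption, not the original helper):

def remove_non_ascii(text):
    # Drop every character outside the ASCII range.
    return ''.join(ch for ch in text if ord(ch) < 128)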
    def _convert_docx_to_text(self, password=None):
        print "Decoding docx file"
        input_docx = self.cvFile
        outputPath = self.scratchDir
        inputPath = os.getcwd()
        if os.path.exists(input_docx):
            inputPath = os.path.dirname(input_docx)
        input_filename = os.path.basename(input_docx)
        input_parts = input_filename.split(".")
        input_parts.pop()
        randomStr = int(time.time())
        output_filename = outputPath + os.path.sep + ".".join(input_parts)  + r".txt"
        output_filename = output_filename.replace (" ", "_")
        print "writing output to {0}".format(output_filename)

      #  self.cvTextFile = output_filename
        text = docx2txt.process(input_docx)
      #  print text
        fw = open(output_filename, "w")
        print "test"
        fw.write(text.encode('utf-8'))
        print "written sucessfully"
        fw.close()
        return(0)
def get_file(file_name, query):

    print("inside model:", file_name)
    ext = file_name.split(".")[-1]
    text = ''
    print("Found file with extension " + ext)

    if ext == 'docx':
        text = docx2txt.process(file_name)

    elif ext == 'txt':
        with open(file_name) as f:
            for line in f:
                text = text + line

    elif ext == 'xlsx':
        f = pd.ExcelFile(file_name)
        for names in f.sheet_names:
            sheet = pd.read_excel(f, names, header=None)
            for row in sheet.values:
                for w in row:
                    text = text + str(w)

    elif ext == 'pdf':
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_name, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()

    print(ext, len(text), type(text))

    train_data = pd.read_csv("traindata.csv")

    sentences = re.split('\n', text)

    dataset_sentences = pd.DataFrame(sentences, columns=["sentences"])

    null_sentences = dataset_sentences["sentences"] != ''
    dataset_sentences = dataset_sentences[null_sentences]
    final_sentence = []
    for sent in dataset_sentences["sentences"]:
        final_sentence.append(sent.lstrip('0123456789. '))

    final_df = pd.DataFrame(final_sentence, columns=["final_sentences"])

    final_df["final_sentences"] = final_df["final_sentences"].str.replace(
        '"', '')
    punctuations = list("!:?.;,_%`()")
    for punct in punctuations:
        # regex=False: characters such as '?', '(' and ')' are not valid regex patterns on their own
        final_df["final_sentences"] = final_df["final_sentences"].str.replace(
            punct, '', regex=False)

    final_df["final_sentences"] = final_df["final_sentences"].str.replace(
        "’s", '', regex=False)

    punctuations2 = list("\\-/")
    for punct2 in punctuations2:
        final_df["final_sentences"] = final_df["final_sentences"].str.replace(
            punct2, ' ', regex=False)
    for i in range(2):
        final_df["final_sentences"] = final_df["final_sentences"].str.replace(
            "  ", ' ', regex=False)

    final_df["final_sentences"] = final_df["final_sentences"].str.lower()

    stop_words = list(stopwords.words('english'))
    stopwords_1 = [
        "would", "able", "due", "one", "need", "co", "so4", "socio", "many",
        "small", "low", "go", "per"
    ]
    stopwords_final = stop_words + stopwords_1
    key_words = []

    for sentence in final_df["final_sentences"]:
        words = word_tokenize(sentence)
        for word in words:
            if word not in stopwords_final:
                key_words.append(word)

    lemmat = WordNetLemmatizer()
    lem_list = [lemmat.lemmatize(word, pos='v') for word in key_words]

    tag = nltk.pos_tag(lem_list)
    exclude_tag = [
        "RBR", "RB", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC", "VBD",
        "DT", "VBN"
    ]
    tagged_list = []
    [tagged_list.append(x[0]) for x in tag if x[1] not in exclude_tag]

    keywords_d = []
    [keywords_d.append(x) for x in tagged_list if x not in keywords_d]
    keywords_df = pd.DataFrame(keywords_d, columns=['keywords'])

    vector = Word2Vec([keywords_d], min_count=1)
    vector_all = []
    for x in keywords_d:
        vector_all.append(vector.wv[x].tolist())  # access word vectors via .wv

    X_train = list(train_data["keywords"])
    y_train = list(train_data["prediction_numeric"])

    vector1 = Word2Vec([X_train], min_count=1)
    vector_train1 = []
    for x in X_train:
        vector_train1.append(vector1.wv[x].tolist())

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(vector_train1, y_train)

    keywords_df["prediction"] = knn.predict(vector_all)

    keywords_df["prediction_word"] = np.where(
        keywords_df["prediction"] == 1, "customer",
        np.where(
            keywords_df["prediction"] == 2, "employee",
            np.where(
                keywords_df["prediction"] == 3, "finance",
                np.where(keywords_df["prediction"] == 4, "industry",
                         "management"))))

    final_text = ""
    for sent in final_sentence:
        final_text += sent + " "

    def get_topic(query):
        q_tokens = word_tokenize(query)
        q_tokens_pos = nltk.pos_tag(q_tokens)
        exclude_tag = [
            "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC", "VBD",
            "DT", "VBN", "VBZ", "WP", '.'
        ]
        q_tagged_list = []
        [
            q_tagged_list.append(x[0]) for x in q_tokens_pos
            if x[1] not in exclude_tag
        ]

        topic = []

        for query_word in q_tagged_list:
            pred = keywords_df.loc[keywords_df["keywords"] == query_word]
            for i in pred["prediction_word"]:
                if i != 0:
                    if i not in topic:
                        topic.append(i)

        return topic

    def main_query(query):
        actual_query = query
        query = query.replace('?', '')
        new_text = ""
        new_sentences = ""
        new1 = ""

        if ext == "docx":
            passage = docx2txt.process(file_name)
            sentences = re.split('\n', passage)
            new_text = ""
            for i in sentences:
                if i != "":
                    j = i.lstrip('0123456789. ')
                    if (len(j) != len(i)):
                        if new_text != "":
                            new_text = new_text + " " + j
                        else:
                            new_text = new_text + j
            new1 = new_text
            new_sentences = sent_tokenize(new_text)
            print('inside docx')

        elif ext == 'txt':
            passage = ""
            with open(file_name) as f:
                for line in f:
                    passage = passage + line
            sentences = re.split('\n', passage)
            new_text = ""
            print("Length of sentences generated :", len(sentences))
            for i in sentences:
                if i != "":
                    j = i.lstrip('0123456789. ')
                    if (len(j) != len(i)):
                        if new_text != "":
                            new_text = new_text + " " + j
                        else:
                            new_text = new_text + j

            new_sentences = sent_tokenize(new_text)
            print('inside txt')

        elif ext == 'pdf':
            text = ""
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(file_name, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            converter.close()
            fake_file_handle.close()
            passage1 = text
            print("PDF")
            text_split = passage1.split()
            pdf_sent = ""
            for t in text_split:
                t = t.lstrip('0123456789. ')
                if t != "":
                    if pdf_sent == "":
                        pdf_sent = t + " " + pdf_sent
                    else:
                        pdf_sent = pdf_sent + " " + t

            print(pdf_sent)

            new_sentences = sent_tokenize(pdf_sent)
            print("PDF tokenize: ", len(new_sentences))

            new_text = ""
            for sent in new_sentences:
                new_text = sent + new_text

            print('inside pdf')

        elif ext == "xlsx":
            text = ""
            f = pd.ExcelFile(file_name)
            for names in f.sheet_names:
                sheet = pd.read_excel(f, names, header=None)
                for row in sheet.values:
                    for w in row:
                        w = w.lstrip('0123456789. ')
                        if text == "":
                            text = text + str(w)
                        else:
                            text = text + " " + str(w)

            new_text = text
            new_sentences = sent_tokenize(new_text)
            print("xlsx tokenize: ", len(new_sentences))
            print('inside excel')

        new2 = new_text
        print(new1 == new2)
        print(new_text)
        print(len(new_text))

        if query.startswith('is') or query.startswith('does'):

            result = predictor.predict(passage=new_text, question=query)
            answer = result['best_span_str']

            tokenized_doc = []

            for d in final_df["final_sentences"]:
                tokenized_doc.append(word_tokenize(d.lower()))

            tagged_data = [
                TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)
            ]

            model = Doc2Vec(tagged_data,
                            vector_size=20,
                            window=2,
                            min_count=1,
                            workers=4,
                            epochs=100)
            model.save("test_doc2vec.model")
            model = Doc2Vec.load("test_doc2vec.model")

            q_tokens = word_tokenize(query)
            q_tokens_pos = nltk.pos_tag(q_tokens)
            exclude_tag = [
                "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
                "VBD", "DT", "VBN", "VBZ"
            ]
            q_tagged_list = []
            [
                q_tagged_list.append(x[0]) for x in q_tokens_pos
                if x[1] not in exclude_tag
            ]

            a_tokens = word_tokenize(answer)
            a_tokens_pos = nltk.pos_tag(a_tokens)
            exclude_tag = [
                "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
                "VBD", "DT", "VBN", "VBZ"
            ]
            a_tagged_list = []
            [
                a_tagged_list.append(x[0]) for x in a_tokens_pos
                if x[1] not in exclude_tag
            ]

            query_final = ""
            for i in q_tagged_list:
                query_final += i + " "

            answer_final = ""
            for i in a_tagged_list:
                answer_final += i + " "

            vec1 = model.infer_vector(query_final.split())
            vec2 = model.infer_vector(answer_final.split())

            similarity = spatial.distance.cosine(vec1, vec2)

            if ((similarity >= 0.005 and similarity <= 0.006)
                    or (similarity >= 0.012 and similarity <= 0.022)
                    or (similarity >= 0.0561 and similarity <= 0.0568)):
                return "No"

            else:
                return "Yes"

        else:

            if actual_query.endswith("?"):
                actual_query = actual_query
            else:
                actual_query = actual_query + "?"

            result = predictor.predict(passage=new_text, question=actual_query)
            answer = result['best_span_str']
            similarity_value = []
            print(len(new_sentences))
            print('inside what questions : ')
            print(answer)
            for k in new_sentences:

                output_tokenize = word_tokenize(answer)
                k_tokenize = word_tokenize(k)

                sw = stopwords.words('english')
                l1 = []
                l2 = []

                output_set = {w for w in output_tokenize if not w in sw}
                k_set = {w for w in k_tokenize if not w in sw}

                rvector = output_set.union(k_set)
                for w in rvector:
                    if w in output_set: l1.append(1)  # create a vector
                    else: l1.append(0)
                    if w in k_set: l2.append(1)
                    else: l2.append(0)
                c = 0

                for i in range(len(rvector)):
                    c += l1[i] * l2[i]
                    cosine = c / float((sum(l1) * sum(l2))**0.5)

                similarity_value.append(cosine)

            print("Result : ")

            print(max(similarity_value))
            print(new_sentences[similarity_value.index(max(similarity_value))])

            answer = new_sentences[similarity_value.index(
                max(similarity_value))]

            return answer

    def datatype(query):

        Descriptive = [
            'what', 'which', 'who', 'whom', 'whose', 'why', 'where', 'how'
        ]
        Number = ['how much', 'how many', 'how old', 'how far']
        Time = ['when', 'how long']
        Boolean = ['is', 'does']
        secondary_word = [
            'profit', 'sum', 'mean', 'percentage', 'total', 'loss',
            'difference', 'age', 'average', 'maximum', 'minimum'
        ]

        query_words = word_tokenize(query)
        query_first_word = query_words[0]
        query_second_word = query_words[1]
        query_both_words = query_first_word + " " + query_second_word

        i = 0
        for w in query_words[1:]:
            if w in secondary_word:
                i += 1
        if query_first_word == 'what' and i > 0:
            ans_type = 'Numerical'
        elif query_both_words in Number:
            ans_type = 'Numerical'
        elif query_first_word in Time or query_both_words in Time:
            ans_type = 'Date/Time'
        elif query_first_word in Descriptive:
            ans_type = 'Text'
        elif query_first_word in Boolean:
            ans_type = 'Boolean'
        else:
            ans_type = 'Please enter valid question'

        return ans_type

    return main_query(query), get_topic(query), datatype(query)
    def process_doxc(self, files):
        self.name = files
        return process(str(files))
Example #20
docs_arr = documents.find({
    'file_name': {'$exists': True},
    'send_to_server': {'$exists': False},
    'file_extension': '.docx',
    'error': {'$exists': False}
}, no_cursor_timeout=True)

conn = pymysql.connect(host='176.112.205.12', user='******', password='******', db='referats', charset='utf8mb4')
cursor = conn.cursor()

counter = 1
for doc in docs_arr:
    try:
        file_path = FILES_DIR + doc['file_name']
        print(str(counter)+'. Sending... ' + doc['file_name'])
        text = docx2txt.process(file_path)
        sql = 'insert into documents (mongo_doc_id,title,content,file_name) values(%s, %s, %s, %s)'
        result = cursor.execute(sql, [
            str(doc['_id']),
            doc['title'],
            text,  # reuse the text extracted above instead of processing the file twice
            doc['file_name']
        ])
        conn.commit()
        documents.update_one({'_id': doc['_id']}, {'$set': {'send_to_server': True}})
        counter += 1
    except zipfile.BadZipFile:
        print('Error BadZipFile')
        documents.update_one({'_id': doc['_id']}, {'$set': {'error': 'zipfile.BadZipFile'}})
        counter += 1
        continue
Example #21
    pd.read_excel('Eamcet Key-converted.xlsx', sheet_name=1, usecols=[10, 11])
]
questions = {}
for dat in data:
    for index, row in dat.iterrows():
        questions[row[0]] = row[1]
# print(questions)

# document = Document('Eamcet Sravan Response Sheet-converted.docx')

# fullText = []
# for para in document.paragraphs:
#     fullText.append(para.text)
# print(fullText)

ques_text = docx2txt.process("filename.docx")
ques_list = ques_text.split()
# print(ques_list)

q_a = []
for i in range(len(ques_list)):
    if ('Question' in ques_list[i]):
        if (ques_list[i + 1] == 'Type'):
            continue
        q = int(ques_list[i + 3])

        ans = int(ques_list[i + 7])
        q_a.append([q, ans])

# print(q_a)
score = 0
Example #22
    def read_file(self):
        self.content = docx2txt.process(self.path)
Example #23
Citation examples used in the sample text (random authors and titles with random dates)
(Sabbagh, 2009)
(Sabbagh, n.d.)
(Sabbagh, 2010a)
(Sabbagh, 2010b)
(Qianyi Gu & Sumner, 2006)
(Despotovic-Zrakic et al., 2012)
(Anonymous, 2010)
(Anonymous, n.d.)
(“Barcelona to Ban Burqa,” 2010)
"""
import docx2txt as docx
import re

# Open the document
text = docx.process("lorem_sample.docx")
# Specify a very large number so that each `replace()` call catches
# all stylized double quotes
num_replaces = 100000000
# Replace stylized doubled quotes by the default double quotes
# https://www.w3schools.com/charsets/ref_utf_punctuation.asp
text = text.replace('“', '"',
                    num_replaces).replace('”', '"', num_replaces).replace(
                        '„', '"',
                        num_replaces).replace('‟', '"', num_replaces)
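# Note (not part of the original example): str.replace() already substitutes
# every occurrence when no count argument is given, so the chain above can be
# written more simply:
text = (text.replace('“', '"').replace('”', '"')
            .replace('„', '"').replace('‟', '"'))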

# Text between double quotes: https://stackoverflow.com/a/378447/9263761
# Pattern to find all types of citations
pattern = r'\(([^"\)]*|\bAnonymous\b|"[^"\)]*")(, )([\d]+|n\.d\.|[\d]+[\w])\)'
"""
\( -> opening parentheses
Example #25
import docx2txt
import pandas as pd
import docx
from docx import Document
import nltk as nl
import numpy as np
my_text = docx2txt.process("sample1_cv.docx")
document = Document('sample1_cv.docx')
lst = pd.read_csv("repository.csv")
a = my_text.encode("ISO-8859-1", "ignore")
headings = []
tables = document.tables
for table in tables:
    for row in table.rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                headings.append(paragraph.text)
headings = [str(k) for k in headings]
sections = a.split('\n\n')
sec_no = 0
paras = ['']
for sec in sections:
    if sec in headings:
        sec_no += 1
        paras.append('')
        continue
    paras[sec_no] += sec + "\n"
import nltk
tokens = []
token_final = []
    parser.add_argument("--label_file_path",
                        type=str,
                        default="",
                        help="Path to the label data file")

    args = parser.parse_args()
    path = args.folder_path
    label_file = args.label_file_path
    if not os.path.exists(path):
        print("path does not exist: %s" % path)

    if not os.path.exists(label_file):
        print(" set proper label file path, path does not exists %s" %
              label_file)
        sys.exit(0)
    df = pd.read_csv(label_file)
    list_doc = glob.glob(path + "/*docx")
    new_label_data = []
    for file_path in list_doc:
        File_name = os.path.basename(file_path)[:-9]
        text = docx2txt.process(file_path)
        list_sentence = [x.replace("\n", " ") for x in sent_tokenize(text)]
        row = df.loc[df['File Name'] == File_name]
        label_data = trim_entity_spans(get_all_sentence(list_sentence, row))
        new_label_data += label_data

    if not os.path.exists('pickle_files'):
        os.makedirs('pickle_files')
    pickle_file = '_'.join(path.strip().split('/')[-1].split()) + '.pickle'
    with open('pickle_files/' + pickle_file, 'wb') as f:
        pickle.dump(new_label_data, f)
Example #27
def read_pc_doc_file(doc_file_name):
    try:
        text = docx2txt.process(doc_file_name)
        print 'Doc file %s found'%(doc_file_name)
    except:
        print 'No Doc file %s found'%(doc_file_name)
        return []

    index = 0
    start_text = 'The said goods may be allowed to be cleared at NIL rate of duty in terms of Notification 52/2003-Cus dated 31.03.2006, under intimation to the undersigned.'
    end_text = 'OFFICE OF THE SUPERINTENDENT OF CENTRAL EXCISE, WAGHOLI'
    address_list = []
    while index < len(text):
        start = text.find(start_text, index) + len(start_text)
        end = text.find(end_text, start)
        if end >=0:
            address_text = text[start:end]
            address_list.append(text[start:end].strip().replace('\n\n', '\n'))
            index = end
        else:
            address_text = text[start:]
            address_list.append(text[start:].strip().replace('\n\n', '\n'))
            index = len(text)


    index = 0
    start_text = 'Balance in B-17 Bond Rs.'
    end_text = 'The said goods may be allowed to be cleared at NIL rate of duty in terms of Notification 52/2003-Cus dated 31.03.2006, under intimation to the undersigned.'
    balance_list = []
    # print text
    while index < len(text):
        start = text.find(start_text, index) + len(start_text)
        if start < len(start_text):
            break
        end = text.find(end_text, start)
        balance_text = text[start:end].strip()
        balance_list.append(balance_text)
        index = end

    index = 0
    start_text = 'This registration authorizes them to obtain/clear material from'
    end_text = 'for re'
    short_address_list = []
    # print text
    while index < len(text):
        start = text.find(start_text, index) + len(start_text)
        if start < len(start_text):
            break
        end = text.find(end_text, start)
        short_address_text = text[start:end].strip()
        short_address_list.append(short_address_text)
        index = end

    document = docx.Document(doc_file_name)
    table_list = []
    tables = document.tables
    for i in range(1,len(tables), 2):
        table = tables[i]
        table_list.append({})
        table_list[-1]['sr_no'] = get_text_from_cell(table.rows[1].cells[0])
        table_list[-1]['description_of_goods'] = get_text_from_cell(table.rows[1].cells[1])
        table_list[-1]['qty'] = get_text_from_cell(table.rows[1].cells[2])
        table_list[-1]['cif_value'] = get_text_from_cell(table.rows[1].cells[3])
        table_list[-1]['total'] = get_text_from_cell(table.rows[2].cells[3])
        table_list[-1]['address'] = address_list[i/2]
        table_list[-1]['balance'] = balance_list[i/2]
        table_list[-1]['short_address'] = short_address_list[i/2]

    return table_list
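
get_text_from_cell is not shown in this example; a minimal sketch of such a helper using python-docx (an assumption, not the original code):

def get_text_from_cell(cell):
    # Join the text of every paragraph inside a python-docx table cell.
    return '\n'.join(p.text for p in cell.paragraphs)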
Example #28
import os
import docx2txt
import time
os.chdir('C:\\Users\\vprakas\\Desktop\\python\\kpi')
a = os.listdir('C:\\Users\\vprakas\\Desktop\\python\\kpi')
#fullText=[]
for i in a:
    try:
        b = i.split("_")
        cc = b[0]
        print(cc)
        text = docx2txt.process(i)
        b = text.splitlines()
        #print(b)
        for x in range(len(b)):
            if "Maintenance Window" in b[x]:
                changetime = str(b[x + 2:x + 3])
                print(changetime)
            if "Affected Devices" in b[x]:
                for z in b[x:x + 2]:
                    q = z
                    servername = q.replace("Affected Devices:", "")
                    print(servername)
        print("*" * 100)
    except:
        pass
def wordextract(file):
    text = docx2txt.process(file)
    return text

def plot():
    plt.ylabel('Noun')
    plt.xlabel('Frequency')
    plt.xticks(rotation='vertical')
    plt.title('Noun vs Frequency')
    plt.hist(sorted_list, rwidth=0.85, bins=len(counter))
    plt.show()


Filename = input('Enter File Path:\n')

# If file is in docx format, it is converted to txt before being processed for noun extraction

if Filename.endswith('.docx'):
    lines = docx2txt.process(Filename)

    noun_extraction()
    getfrequency()
    plot()

else:

    File = open(Filename)
    lines = File.read()

    noun_extraction()
    getfrequency()
    plot()
    wcs = doc.BuiltInDocumentProperties("Number of Words")
    wordcounts.append(str(wcs))
    lastsave = doc.BuiltInDocumentProperties("Creation Date")
    lastsavel.append(str(lastsave))
    doc.Close()

print("Authors found!")
print("Wordcounts done!")
print("Creation dates found!")

#now to convert to plain text
editednobr = []
originalnobr = []

for i in efilenames:
    edited = docx2txt.process(i)
    edited = edited.replace('\r', '').replace('\n', '')
    editednobr.append(edited)

for i in ofilenames:
    original = docx2txt.process(i)
    original = original.replace('\r', '').replace('\n', '')
    originalnobr.append(original)

#split into sections -- ORIGINAL
headingol = []
backdigestol = []
argumentsol = []
notesol = []

for i in originalnobr:
Example #32
# -*- coding: utf-8 -*-
import requests
import xml.etree.ElementTree as ET
import codecs
import docx2txt
import numpy

a=docx2txt.process("TextInput.docx")

a=a.replace(u'“', '')
a=a.replace(u'”', '')


request = u"""<?xml version="1.0" encoding="utf-8"?>
<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:web="http://webFdgRo.uaic/">
   <soapenv:Header/>
   <soapenv:Body>
      <web:parseText>
         <txt>""" + a + u"""</txt>
      </web:parseText>
   </soapenv:Body>
</soapenv:Envelope>"""

encoded_request = request.encode('utf-8')

headers = {"Host": "www.webFdgRo.uaic",
           "Content-Type": "text/xml; charset=UTF-8",
           "Content-Length": str(len(encoded_request))}
Example #33
        print(row)

import json
with open("file.json") as file:
    content = json.load(file)

content.get("bişi..")

import xml.etree.ElementTree as ET
content = ET.parse("jile.xml")
root = content.getroot()
for child in root:
    print(child.tag, child.text)

import docx2txt
content = docx2txt.process("file.docx")
print(content)

from PyPDF2 import PdfFileReader
file = open("file.pdf", "rb")
content = PdfFileReader(file)
text = content.getPage(0)
text.extractText()

content.numPages

#%% Site blocking

import time
from datetime import datetime as dt
Example #34
import docx2txt
import re
import os
base_folder = './docs'

documents = os.listdir(base_folder)
filtered_docs=[]
text=''
ending_with_docs= ['docx','doc']

print "Started parsing documents........"
for doc in documents:
    if doc.split(".")[1] == ending_with_docs[0].lower() or doc.split(".")[1]==ending_with_docs[1].lower():
        filtered_docs.append(doc)
        text=text+docx2txt.process('./docs/'+doc)
    else:
        print "Invalid file(Not doc/docx)"+doc
print "Finished parsing documents"
regex_email=re.findall(r"[\w\S]+[@]\w+[.]\w{3,}",text)
print 'Emails are:'
print regex_email


Example #35
def get_info(fileDir, files_array):
    while True:
        try:
            #print(os.path.exists(fileDir, fileDir))
            #files_array = [_ for _ in os.listdir(fileDir) if _.endswith(fileExt)]
            indice = 0
            image_array = []
            table_chunks = []
            table_return = []

            for file in files_array:
                #
                # scrape the Word file's page count (x.group()); the data is inside docProps/app.xml
                #

                pagina_xml_code = subprocess.Popen(
                    ["unzip", "-p", fileDir + file, "docProps/app.xml"],
                    stdout=subprocess.PIPE)
                output = pagina_xml_code.communicate()[0]
                #print(output)
                pagina_xml_str = output.decode("utf-8")
                x = re.search('(?<=\<Pages\>)(.*)(?=\<\/Pages\>)',
                              pagina_xml_str)
                # print(file)

                #try:
                #    pagina_xml = x.group()
                #except ValueError:
                #    print("Oops! XML não encontrado")

                try:
                    pagina_xml = x.group()
                except:
                    print("Oops! XML não encontrado para file: " + file)
                    continue
                #print(pagina_xml)

                # open the Word file
                #doc = docx.Document(fileDir+'/'+file)
                doc = docx.Document(fileDir + file)

                # creation date
                dt_doc = doc.core_properties.created
                # read each paragraph in the Word file
                paragra = [p.text for p in doc.paragraphs]

                # extract the character counts
                #caracteres = docx2txt.process(fileDir+'/'+file)
                content = docx2txt.process(fileDir + file)
                #### new code
                caracteres = []
                for line in content.splitlines():
                    #This will ignore empty/blank lines.
                    if line != '':
                        #Append to list
                        caracteres.append(len(line))

                # end of new code
                #### new code
                # collect the paragraph styles the Word document uses
                # and store them in the array_styles variable
                from docx.enum.style import WD_STYLE_TYPE
                styles = doc.styles
                array_styles = []
                paragraph_styles = [
                    s for s in styles if s.type == WD_STYLE_TYPE.PARAGRAPH
                ]
                for style in paragraph_styles:
                    array_styles.append(style.name)
                    #print(style.name)
                #print(len(array_styles))
                # end of new code

                # build an array with the size of every image
                for image in doc.inline_shapes:
                    image_array.append([image.width, image.height])
                    # print (image.width, image.height)

                # build an array with all the tables
                for table in doc.tables:
                    table_chunks.append(table)

                # build the result row
                list_retur = file[2:8], pagina_xml, sum(caracteres), len(
                    table_chunks), len(image_array), len(
                        array_styles), dt_doc.date()

                # append to the result table
                table_return.append(list_retur)
            #return(files_array[indice][2:8],len(caracteres), len(table_chunks), len(image_array),dt_doc.date())
            return table_return
            #break
        except ValueError:
            print(
                "Oops! error in this method: check the directory and file extension passed as its arguments"
            )
Example #36
    def convertDocx(self, filename):
        rawtext = docx2txt.process(filename)
        with open(filename[:-5] + '_processed.txt', 'w') as f:
            f.write(rawtext)
regex1 = re.compile(
    r"\b(?:Question 1: What do you think are the current challenges to sustainable development in the (Mekong Lancang|MekongLancang) region\?)(?P<Answer1>.*?)(?:Question 2: What does regional cooperation mean to you\? What are the opportunities for regional cooperation to support sustainable development in the Mekong- Lancang\?)(?P<Answer2>.*?)(?:Question 3: From your experience, are there examples where some or all of the Mekong-Lancang countries have cooperated to yield a clear and positive trans-boundary river management outcome\?)(?P<Answer3>.*?)(?:Question 4: What are the relative advantages\/merits of the different mechanisms for cooperation, and do you see any opportunities for improvements\?)(?P<Answer4>.*?)(?:Question 5: In your opinion, when cooperation occurs between Lancang-Mekong countries, what indicates its success\? How do you know if cooperation is successful\?)(?P<Answer5>.*?)(?:Question 6: From your experience, for what types of Lancang-Mekong problems has cooperation been most effective\?)(?P<Answer6>.*?)(?:Question 7: In your view, which factors prevent cooperation\? And which factors enable it\?)(?P<Answer7>.*?)(?:Question 8: From your experience, when Lancang-Mekong countries cooperate for sustainable development of the basin, who are the most influential actors\?)(?P<Answer8>.*?)(?:Question 9: In your opinion, how can governments balance natural resources sustainability with economic development goals\?)(?P<Answer9>.*)\b"
)
regex2 = re.compile(
    r"Interview with (?P<Name>.*?,)(?P<Org>.*?,)(?P<Country>.*?,)")
columns = [
    "File Name", "Answer1", "Answer2", "Answer3", "Answer4", "Answer5",
    "Answer6", "Answer7", "Answer8", "Answer9"
]
df = pd.DataFrame(columns=columns)
#%%
file_data = {}
index = 0

for file in onlyfiles:
    my_text = docx2txt.process(file)
    shorted_file_name = re.sub(".docx", "", file)
    text_file_name = shorted_file_name + ".txt"
    print(text_file_name)
    with open(text_file_name, "w", encoding='utf-8') as text_file:
        print(my_text, file=text_file)
    f = open(text_file_name, 'r', encoding='utf-8')
    content = f.readlines()
    new_list = []
    for element in content:
        element = re.sub(r'(\s+\\n)|(\\n)', '', element.strip(
        ))  # this is the line we add to strip the newline character
        new_list.append(element)
    new_string = ([
        element.replace("\\u00a0", " ").encode('ascii', 'ignore').decode()
        for element in new_list if element != ""
Example #38
def build_index_text_docx(file_name):
    return docx2txt.process(file_name).replace('\n\n', ' ')
def import_convert_preprocess(url, extension):
    global doc_count
    global crawled_web_dir_preprocessed
    global crawled_web_dir
    global crawled_web_dir_conv_need
    global page_doc_map
    url_map_name = url

    if (url_map_name not in page_doc_map):
        page_doc_map[url_map_name] = -1
        page_ref_count[url_map_name] = 1

        try:
            doc_count_temp = doc_count + 1
            book_name = ""
            if extension == "pdf":
                book_name = str(doc_count_temp) + ".pdf"
            elif extension == "docx":
                book_name = str(doc_count_temp) + ".docx"
            elif extension == "pptx":
                book_name = str(doc_count_temp) + ".pptx"

            book_path = crawled_web_dir_conv_need + "\\" + book_name

            a = requests.get(url, stream=True)

            with open(book_path, 'wb') as book:
                for block in a.iter_content(512):
                    if not block:
                        break
                    book.write(block)

            book.close()

            file_name = str(doc_count_temp) + ".txt"
            file_path = crawled_web_dir + "\\" + file_name
            is_valid_for_indexing = 555
            if extension == "pdf":
                pdf_to_text(book_path, file_name)
                is_valid_for_indexing = preprocess_one_doc_from_pdf(
                    crawled_web_dir, file_name, crawled_web_dir_preprocessed)

            elif extension == "docx":
                text = docx2txt.process(book_path)
                save_text(text, crawled_web_dir, file_name)
                is_valid_for_indexing = preprocess_one_doc(
                    crawled_web_dir, file_name, crawled_web_dir_preprocessed)

            elif extension == "pptx":
                text = pptx_to_text(book_path)
                save_text(text, crawled_web_dir, file_name)
                is_valid_for_indexing = preprocess_one_doc(
                    crawled_web_dir, file_name, crawled_web_dir_preprocessed)

            if (is_valid_for_indexing == 1):
                doc_count = doc_count + 1
                page_doc_map[url_map_name] = doc_count
                doc_page_map[doc_count] = url_map_name
                page_ref_count[url_map_name] = 1
            else:
                delete_file(book_path)
                delete_file(file_path)
                page_doc_map[url_map_name] = -2

        except IOError:
            page_doc_map[url_map_name] = -1
    else:
        page_ref_count[url_map_name] = page_ref_count[url_map_name] + 1
Example #40
def index(request):

    djtext1 = request.POST['t1']
    djtext2 = request.POST['t2']
    state1 = request.POST.get('x1', 'off')
    state2 = request.POST.get('x2', 'off')
    flag = 0
    m1 = len(djtext1)
    m2 = len(djtext2)
    if m1 == 18 and m2 == 18:
        flag = 1
    if state1 == "on" or state2 == "on":
        if state1 == "on":
            url1 = request.POST.get('u1')
            r1 = requests.get(url1)
            htmlcontent1 = r1.content
            soup1 = BeautifulSoup(htmlcontent1, 'html.parser')
            link = soup1.find('td').get_text()
            # link=link[:4000]

        if state2 == "on":
            url2 = request.POST.get('u2')
            r2 = requests.get(url2)
            htmlcontent2 = r2.content
            soup2 = BeautifulSoup(htmlcontent2, 'html.parser')
            link = soup2.find('article').get_text()
            # link=link[:4000]
        if m1 != 18:
            # common text list
            common = ""
            matches = difflib.SequenceMatcher(
                None, djtext1, link).get_matching_blocks()
            for match in matches:
                # print(djtext1[match.a:match.a + match.size])
                common += djtext1[match.a:match.a + match.size]
            seq = difflib.SequenceMatcher(None, djtext1, common)
            d = round(seq.ratio() * 100, 2)
            x = str(d)

            # if common == djtext1 or common == link:
            #     x = "100"
            report(x, djtext1, link, common)

            params = {'text1': djtext1, 'text2': link,
                      'res': flag, 'len1': m1, 'len2': m2,
                      'ans': x, 'com': common}
            return render(request, 'result.html', params)
        elif request.method == 'POST':

            bfile1 = request.FILES['f1']
            ext1 = bfile1.name
            if ext1[-1] == 'x':
                z = docx2txt.process(bfile1)
            elif ext1[-1] == 'f':
                x1 = PyPDF2.PdfFileReader(bfile1)
                z = " "
                num = x1.getNumPages()
                for i in range(num):
                    z += x1.getPage(i).extractText()
            seq = difflib.SequenceMatcher(None, link, z)
            d = round(seq.ratio() * 100, 2)
            x = str(d)
            common = ""
            matches = difflib.SequenceMatcher(
                None, link, z).get_matching_blocks()
            for match in matches:
                common += link[match.a:match.a + match.size]

            report(x, link, z, common)

            params = {'text1': z, 'text2': link, 'res': flag,
                      'len1': m1, 'len2': m2, 'ans': x, 'com': common}
            return render(request, 'result.html', params)

    elif request.method == 'POST' and flag == 1:

        bfile1 = request.FILES['f1']
        bfile2 = request.FILES['f2']

        ext1 = bfile1.name
        ext2 = bfile2.name

        if ext1[-1] == 'x':
            z = docx2txt.process(bfile1)
        elif ext1[-1] == 'f':
            x1 = PyPDF2.PdfFileReader(bfile1)
            z = " "
            num = x1.getNumPages()
            for i in range(num):
                z += x1.getPage(i).extractText()

        if ext2[-1] == 'x':
            y = docx2txt.process(bfile2)

        elif ext2[-1] == 'f':
            x2 = PyPDF2.PdfFileReader(bfile2)
            y = " "
            num = x2.getNumPages()
            for i in range(num):
                y += x2.getPage(i).extractText()
        seq = difflib.SequenceMatcher(None, y, z)
        d = round(seq.ratio() * 100, 2)
        x = str(d)
        common = ""
        matches = difflib.SequenceMatcher(
            None, y, z).get_matching_blocks()
        for match in matches:
            common += y[match.a:match.a + match.size]
        if common == y or common == z:
            x = "100"
        report(x, y, z, common)
        params = {'text1': z, 'text2': y, 'res': flag,
                  'len1': m1, 'len2': m2, 'ans': x, 'com': common}
        return render(request, 'result.html', params)
    else:
        common=""
        matches = difflib.SequenceMatcher(
                     None, djtext1, djtext2).get_matching_blocks()
        for match in matches:
                 common+=djtext1[match.a:match.a + match.size]
        seq = difflib.SequenceMatcher(None, djtext1, djtext2)
        d = seq.ratio()*100
        d = round(d, 2)
        x = ""
        x = str(d)
       
        # if common==djtext1 or common==djtext2:
        #     x="100"
               
        report(x,djtext1,djtext2,common)
        params = {'text1': djtext1, 'text2': djtext2,
                  'res': flag, 'len1': m1, 'len2': m2, 'ans': x,'com':common}
        return render(request, 'result.html', params)
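
# The matching-blocks + ratio computation above is repeated in every branch of
# the view.  A sketch of a helper it could call instead; the name
# similarity_and_common is mine, and it mirrors the file/file and URL/file
# branches (the first text/URL branch instead scores djtext1 against the
# extracted common text).
import difflib

def similarity_and_common(text_a, text_b):
    """Return (similarity percentage as a string, concatenated matching text)."""
    matcher = difflib.SequenceMatcher(None, text_a, text_b)
    common = "".join(text_a[m.a:m.a + m.size]
                     for m in matcher.get_matching_blocks())
    return str(round(matcher.ratio() * 100, 2)), common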
Example #41
0
import docx2txt
import os


def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)


basedir = "Relatos Esquizofrenia/"

file = "Relatos Esquizofrenia/Relato A/Control/1.a.docx"
test = docx2txt.process(file)

for subdir, dirs, files in os.walk(basedir):
    for file in files:

        filedir = os.path.join(subdir, file)
        output = docx2txt.process(filedir)
        split = filedir.split("/")[1]
        split = split.split("\\")
        print(split)
        ensure_dir("txts\\" + split[0] + "\\")
        f = open(
            "txts\\" + split[0] + "\\" + split[1] + split[2].split('.')[0] +
            ".txt", "w", encoding="utf8")
        f.write(output)
        f.close()
Example #42
0
#Takes in the text and runs all the functions required for creating a summary from it.
def run_summarization(text):
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)
    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''

    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)

    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)

    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 5 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.5 * threshold)

    return summary


#In main, start running summarization on the document specified.
if __name__ == '__main__':
    document = docx2txt.process(
        "(Edited) Copy of Sources_Sought_Trainer_LFTS.docx")
    result = run_summarization(document)
    print(result)
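
# _create_frequency_table, _score_sentences, _find_average_score and
# _generate_summary are not shown in this snippet.  Below is a minimal sketch
# of the last two steps; only the names and call signatures come from the
# code above -- the bodies are assumptions.
def _find_average_score(sentence_scores):
    # Average score over all scored sentences, used as the base threshold.
    return sum(sentence_scores.values()) / len(sentence_scores)

def _generate_summary(sentences, sentence_scores, threshold):
    # Keep the sentences whose score clears the (scaled) threshold.
    return " ".join(s for s in sentences if sentence_scores.get(s, 0) >= threshold)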
Example #43
0
import os
from gtts import gTTS
import docx2txt

# pip install gTTS
# pip install docx2txt

# To convert and play, run this script from the directory containing the .docx file.

# Word-to-text converter: set `filename` below to your .docx file name
# (without the extension).

filename = "COVID19anditsimpactonsports"
MY_TEXT = docx2txt.process(filename + ".docx")

with open(filename + ".txt", "w") as text_file:
    print(MY_TEXT, file=text_file)

# Read back the text file that was just written, joining its lines with spaces.

FLIST = open(filename + ".txt", "r").read().replace("\n", " ")

print("please wait...processing")
TTS = gTTS(text=str(FLIST), lang='en-us')

# Save to mp3 in current dir.
TTS.save(filename + ".mp3")

# Plays the mp3 using the default app on your system
# that is linked to mp3s.
print("Process Done Now File has store in your current dir.")
os.system(filename + ".mp3")
Example #44
0
import nltk
from nltk.corpus import stopwords
import docx2txt

filename= "GC.docx"
text = docx2txt.process(filename)

def unusual_words(text):
    # `text` is expected to be an iterable of word tokens
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

token_list = nltk.word_tokenize(text)

stopwords_removed = [word for word in token_list if word not in stopwords.words('english')]

def filter_duplicates(text_list):
    output_list = []
    for word in text_list:
        if word not in output_list:
            output_list.append(word)
    return output_list


# Assumed step (the original does not define `filtered`): remove duplicates
# from the stopword-filtered tokens before stemming.
filtered = filter_duplicates(stopwords_removed)

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmed = []
for word in filtered:
    stemmed.append(st.stem(word))
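
# Usage sketch (not in the original): unusual_words iterates over whatever it
# is given, so it should be called with the word tokens rather than the raw
# string (iterating a string would compare single characters).  Requires the
# NLTK "words" corpus, e.g. nltk.download('words').
rare = unusual_words(token_list)
print(rare[:20])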
Example #45
0
def getDocxContent(filename):
    DocxText = docx2txt.process(filename)
    return DocxText
Example #46
0
  folder = int(hierachy[-1])
  #print(files)

  for filename in files:

    #if file_counter > 100:
      #break

    tu = []

    # filepath for each file
    f = root + "/" + filename

    # print(file)
    # read file content
    text_dataframe = d2t.process(f)

    if len(text_dataframe) > mask:
      text_dataframe = text_dataframe[:mask]
    elif len(text_dataframe) < mask:
      continue

    input_ids = torch.tensor(tokenizer.encode(text_dataframe)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)

    last_hidden_states, mems, hidden_states = outputs

    #print(len(last_hidden_states))
    #print(len(last_hidden_states[0]))
    #print(len(last_hidden_states[0][0]))
Example #47
0
sheets = workbook.sheet_names()
required_data = []
for sheet_name in sheets:
    sh = workbook.sheet_by_name(sheet_name)
    for rownum in range(sh.nrows):
        row_values = sh.row_values(rownum)
        required_data.append((row_values[4]))
required_data2 = []
for sheet_name in sheets:
    sh = workbook.sheet_by_name(sheet_name)
    for rownum in range(sh.nrows):
        row_values = sh.row_values(rownum)
        required_data2.append((row_values[5]))
required_data1 = list(filter(None, required_data))
z = os.getcwd()
text = docx2txt.process(z + "\\SampleInputDoc1-FAQs.docx")
blob = TextBlob(text)
tokenizer = BlanklineTokenizer()
z = blob.tokenize(tokenizer)
c = '?'
lst = list()
for i in range(0, len(z)):
    x = z[i].find(c)
    if x != -1:
        lst.append(z[i])

import xlsxwriter

workbook = xlsxwriter.Workbook('SampleOutput.xlsx')
worksheet = workbook.add_worksheet()
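
# The snippet stops after creating the worksheet.  An assumed continuation
# that writes the collected question strings one per row in column A and
# closes the output workbook; the column layout is a guess.
for row, question in enumerate(lst):
    worksheet.write(row, 0, question)
workbook.close()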
Example #48
0
 def convert_docx_to_txt(self, path):
     # https://github.com/ankushshah89/python-docx2txt
     # Very simple setup of python-docx to text
     text = docx2txt.process(path)
     return unicode(text)
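
# Besides plain text, docx2txt can also dump a document's embedded images
# when an output directory is passed as the second argument.  A short usage
# sketch -- the paths below are placeholders, not from the original.
import os
import docx2txt

os.makedirs("extracted_images", exist_ok=True)
text = docx2txt.process("report.docx", "extracted_images")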
Example #49
0
"""
Exploring whether capitalization is a usable signal for identifying the
important words in a document, and how feasible that would be in our program.

On its own the signal seems too vague to be reliable. Combined with another
form of search it may still help, since most capitalized words are the
requesting entities, section headings, and similar items.
"""

import docx2txt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

#The name of the file we wish to process
document1 = docx2txt.process("Copy of Sources Sought Synopsis Manuals 8 Jan 2020.docx")


stopwords = list(STOP_WORDS)

len(stopwords)

#Load the medium spacy package so there is at least vectors for calculating similarity
nlp = spacy.load('en_core_web_md')

docx = nlp(document1)

#An array that will hold the important words, IE those capitalized
importantWords = []
for word in docx:
    if word.text not in stopwords:
        # assumed continuation: keep tokens that start with a capital letter,
        # per the comment above
        if word.text and word.text[0].isupper():
            importantWords.append(word.text)
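
# The medium model is loaded above for its word vectors, but the snippet never
# uses them.  A minimal, assumed illustration of a similarity check -- the
# query phrase and the 0.5 threshold are placeholders, not from the original.
query = nlp("request for information")
for token in docx:
    if token.has_vector:
        score = token.similarity(query)
        if score > 0.5:
            print(token.text, round(score, 2))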
Example #50
0
def docx_2_txt(file):
    print(os.path.abspath(file))
    text = docx2txt.process(file)
    with open('{}.txt'.format(file.split('.')[0]), 'w', encoding='utf-8') as fl:
        fl.write(text)
Example #51
0
import docx2txt
my_text = docx2txt.process("/home/narendra/Desktop/leather fact.docx")
print(my_text)
Example #52
0
def read_txt(doc):
    read1 = docx2txt.process(doc)
    read = read1.splitlines()  # split the document into a list of lines
    read = [x for x in read if not x.isdigit()]  # drop lines that are only digits
    read = [x for x in read if x]  # drop empty lines
    return read
Example #53
0
def convert_docx_to_txt(path):
    return docx2txt.process(path)
Example #54
0
def docx_reader(file):
    text = docx2txt.process(open(file, 'rb'))
    # print(text)
    return text
Example #55
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import docx2txt
my_text = docx2txt.process("Basic inbuilt commands.docx")
print(my_text)


# In[3]:


# filter()

# What does the filter() function do?

# filter(function, iterable)

# function:   a function to be run for each item in the iterable
# iterable:   the iterable to be filtered

# For example: if you define names = ['xvc', 'ohg', 'ger', 'gea', 'y4eg'],
# then names is an iterable.
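
# A short, runnable illustration of the above (the names list is made up):
names = ["xvc", "ohg", "ger", "gea", "y4eg"]
three_letter = list(filter(lambda n: len(n) == 3, names))
print(three_letter)   # -> ['xvc', 'ohg', 'ger', 'gea']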


# In[50]:


x = float("3.5000000")
x
Example #56
0
def extract_text_from_docx(docx_path):
    txt = docx2txt.process(docx_path)
    if txt:
        return txt.replace('\t', ' ')
    return None
Example #57
0
 def extract(self, filename, **kwargs):
     return docx2txt.process(filename)