Example #1
0
  def post(self, request, format=None):
    """Handle an uploaded document, extract its text/HTML, and forward it.

    The upload is saved under backendFondecyt/Docs/, converted to raw text
    and HTML (docx via mammoth, txt read directly), POSTed to the external
    analysis service, and the temporary file removed.  Returns the service's
    JSON response with HTTP 201.
    """
    uploaded_file = request.FILES['file']
    file_name = uploaded_file.name
    # BUG FIX: use the LAST dot so names like "report.v2.docx" resolve to
    # "docx" (the original split(".")[1] took the first segment instead).
    file_extension = file_name.rsplit(".", 1)[-1]

    # "with" guarantees the handle is closed even if a chunk write fails
    # (the original leaked the handle on error).
    with open('backendFondecyt/Docs/' + file_name, 'wb+') as destination:
      for chunk in uploaded_file.chunks():
        destination.write(chunk)

    # Default both so an unrecognised extension can't raise NameError below.
    rawText = ""
    html = ""
    if file_extension == "doc":
      file_name = self.converDocToDocx(file_name)
    if file_extension in ("doc", "docx"):
      with open('backendFondecyt/Docs/' + file_name, "rb") as docx_file:
        rawText = mammoth.extract_raw_text(docx_file).value
        docx_file.seek(0)  # rewind before the second mammoth pass
        html = mammoth.convert_to_html(docx_file).value
    if file_extension == "txt":
      with open('backendFondecyt/Docs/' + file_name, "r", encoding="utf-8") as txt_file:
        rawText = txt_file.read()
      # BUG FIX: the original iterated the file AFTER read() had exhausted
      # it, so html was always empty; build paragraphs from the text instead.
      for line in rawText.splitlines():
        if line.strip() != "":
          html += "<p>" + line + "</p>"

    payload = {'texto': rawText, 'html': html}
    data = requests.post('http://redilegra.com/general', data=payload)
    data = json.loads(data.text.encode('utf8'))
    os.remove('backendFondecyt/Docs/' + file_name)
    return Response(data, status.HTTP_201_CREATED)
 def compose_request(self):
     """Render the docx template and return an (html, text) tuple.

     Both the HTML and raw-text conversions of ``self.template_path`` are
     treated as Django templates and rendered against
     ``self.request_information``.
     """
     html_template = Template(
         mammoth.convert_to_html(self.template_path).value)
     context = Context(self.request_information)
     text_template = Template(
         mammoth.extract_raw_text(self.template_path).value)
     unprocessed_text = text_template.render(context)
     # in order to allow for line spacing, need br and p coming from Django
     # context render: convert <br> to newlines and drop <p> tags.
     text_content = re.sub(r"\<\/?p\>", "",
                           re.sub(r"\<br\>", "\n", unprocessed_text))
     # BUG FIX: previously returned text_template.render(context) again,
     # discarding the cleaned text_content computed above.
     return html_template.render(context), text_content
def extract_from_mammoth(source_dir, file, target_format):
    """Convert a document with mammoth and return its content as a one-item list.

    ``target_format == "html"`` yields HTML markup; anything else yields the
    document's raw text.
    """
    import mammoth

    document_path = os.path.join(source_dir, file)
    with open(document_path, 'rb') as handle:
        if target_format == "html":
            converted = mammoth.convert_to_html(handle)
        else:
            converted = mammoth.extract_raw_text(handle)
    return [converted.value]
 def extract_content_from_document(self, filename):
     """Extract content from a .docx file and return a (text, html) tuple.

     Raises ValueError for any extension other than ``.docx``.
     """
     extension = os.path.splitext(filename)[1]
     if extension != '.docx':
         # TODO: handle .doc
         raise ValueError("Can only handle .docx files, but got %s" % extension)
     with open(filename, "rb") as document:
         markup = mammoth.convert_to_html(document).value
         plain_text = mammoth.extract_raw_text(document).value
     return (plain_text, markup)
def validate_template_extension(foia_template):
    """Validate an uploaded FOIA template.

    Checks the file extension, enforces a 2 MiB size cap, and requires that
    the document body contains a ``{{ requested_records }}`` placeholder
    (optionally with a Django filter, e.g. ``{{ requested_records|upper }}``).
    Raises ValidationError on any failure.
    """
    _validate_extension(foia_template, VALID_FOIA_EXTENSIONS)
    # Make sure we'll have no problem reading or submitting this request
    # https://stackoverflow.com/questions/2472422/django-file-upload-size-limit -- 3rd answer
    limit = 2 * 1024 * 1024
    if foia_template.size > limit:
        raise ValidationError("File too large. Size should not exceed 2 MiB.")
    # BUG FIX: the filter group is now optional ("?"), so a plain
    # {{ requested_records }} — exactly what the error message tells the
    # user to write — also satisfies the check.
    record_regex = re.compile(r"{{\s*requested_records(?:\s*\|\s*\w+)?\s*}}")
    document_text = mammoth.extract_raw_text(foia_template).value
    if not record_regex.search(document_text):
        raise ValidationError(
            "You need to place '{{requested_records}}' somewhere in the body of your template"
        )
Example #6
0
def readFile(p, i):
    """Read a .pdf/.txt/.docx file and return a cleaned, tokenized word list.

    p: directory containing the file; i: the file name.  Non-word characters
    are replaced by spaces, tokens are lower-cased, punctuation characters
    are stripped, and stop words (module-level ``stop``) are removed.
    """
    mergedLines = []

    def _tokenize(text):
        # Replace every non-word character with a space, then split on
        # whitespace (raw string avoids an invalid-escape warning).
        return re.sub(r"[^\w]", " ", text).split()

    if i.endswith(".pdf"):
        # "with" closes the PDF handle even if a page fails to parse.
        with open(path.join(p, i), 'rb') as pdfFileObj:
            # creating a pdf Reader object
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            # add all page text to merged text
            for page in range(pdfReader.numPages):
                pageObj = pdfReader.getPage(page)
                mergedLines += _tokenize(pageObj.extractText())
    elif i.endswith(".txt"):
        # BUG FIX: the original only cleaned the first line (lines[0]) and
        # never closed the file; read the whole file and close the handle.
        with open(path.join(p, i), "r") as txtFile:
            mergedLines += _tokenize(txtFile.read())
    elif i.endswith(".docx"):
        with open(path.join(p, i), "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            mergedLines += _tokenize(result.value)

    # Convert all characters to lowercase before tokenizing.
    mergedLines = [w.lower() for w in mergedLines]
    # Strip punctuation characters from each token.
    mergedLines = [
        ''.join(c for c in s if c not in string.punctuation)
        for s in mergedLines
    ]
    # Keep a word only if it is not a stop word.
    mergedLines = [w for w in mergedLines if w not in stop]
    return mergedLines  # return processed text
Example #7
0
def upload_version(request, contract_id):
    """Accept a POSTed contract version, extract its text, and redirect.

    On a valid POST the uploaded Word document's raw text is stored on the
    new version, which is linked to the contract and saved before amendments
    are generated.  Always redirects back to the contract's view page.
    """
    if request.method == 'POST':
        form = VersionForm(request.POST, request.FILES)
        if form.is_valid():
            version = form.save(commit=False)
            # Pull the raw text out of the uploaded Word document.
            version.text = extract_raw_text(version.file).value
            version.contract = Contract.objects.get(pk=contract_id)
            version.uploaded_by = get_first_name()
            version.save()
            make_amendments(version.contract, version)
    return HttpResponseRedirect('/contracts/view/' + str(contract_id))
def read_and_clean_file(file_name, destination_path, stop_words):
    """Extract words from a .txt/.docx/.pdf file and return cleaned keywords.

    Non-alphanumeric symbols (Turkish letters preserved) become spaces,
    tokens are lower-cased, and English stop words plus any caller-supplied
    ``stop_words`` and punctuation tokens are filtered out.
    """
    # Compile the cleaning pattern once; it is shared by every branch.
    pattern = re.compile("[^a-zA-Z_şŞğĞüÜİöÖçÇı]")
    words = []
    full_path = path.join(destination_path, file_name)

    if file_name.endswith(".txt"):
        with open(full_path, 'r') as text_file:
            # Stream the file line by line, cleaning each as we go.
            for raw_line in text_file:
                words += pattern.sub(" ", raw_line).split()
    elif file_name.endswith(".docx"):
        with open(full_path, 'rb') as docx_file:
            # mammoth converts the docx into one raw-text string.
            raw_text = mammoth.extract_raw_text(docx_file).value
            words += pattern.sub(" ", raw_text).split()
    elif file_name.endswith(".pdf"):
        with open(full_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            # Clean each page's extracted text independently.
            for page_index in range(0, pdf_reader.numPages):
                page_text = pdf_reader.getPage(page_index).extractText()
                words += pattern.sub(" ", page_text).split()

    # Lower-case everything before filtering.
    words = [token.lower() for token in words]
    # English stop words plus any caller-supplied extras.
    stop_words_list = set(stopwords.words('english'))
    stop_words_list.update(stop_words)
    # Drop stop words and bare punctuation tokens.
    return [
        token for token in words
        if token not in stop_words_list and token not in string.punctuation
    ]
Example #9
0
    def process_data(self, directory):
        """Parse every document in ``directory`` and train a word2vec model.

        Each file is read with mammoth, cleaned, POS-tagged, and folded into
        the dataset; files that fail to parse are reported and skipped.
        """
        for entry in os.listdir(directory):
            with open('{}/{}'.format(directory, entry), 'rb') as flav:
                print("Processing file ---> {}".format(os.path.basename(entry)))
                try:
                    # result = mammoth.convert_to_html(flav)
                    # doc_results = result.value
                    raw_text = mammoth.extract_raw_text(flav).value
                    cleaned = self.denoise_text(raw_text)
                    meta = self.get_POS(cleaned)
                    status = self.create_dataset(meta)
                except Exception as e:
                    print(e)
                    continue

        # NOTE(review): ``all_sentences`` is not defined in this method —
        # presumably a module-level accumulator filled by create_dataset;
        # verify before relying on it.
        model = models.Word2Vec(all_sentences, min_count=1)

        model.save_word2vec_format("models/word2vec.bin")
Example #10
0
def index(request):
    """List contract categories and handle new template uploads.

    A valid POST stores the uploaded Word document's raw text on the new
    template and redirects to /templates; otherwise the page is rendered
    with an empty (or invalid) form and all categories.
    """
    if request.method == 'POST':
        form = ContractTemplateForm(request.POST, request.FILES)
        if form.is_valid():
            template = form.save(commit=False)
            # Extract the plain text of the uploaded Word document.
            template.text_content = extract_raw_text(template.original_file).value
            template.save()
            return HttpResponseRedirect('/templates')
    else:
        form = ContractTemplateForm()

    context = {
        'categories': ContractCategory.objects.all(),
        'form': form,
    }
    return render(request, 'templateStorage/index.html', context)
def readFilesInDirAndCleanWordList(stop):
    """Tokenize every .txt/.docx/.pdf file under 'text files/' and clean the tokens.

    Non-word characters become spaces, punctuation characters are stripped,
    tokens are lower-cased, and any token present in ``stop`` is dropped.
    """
    dire = path.dirname(__file__)
    p = os.path.join(dire, 'text files/')

    mergedLines = []

    for entry in os.listdir(os.path.join(dire, 'text files')):
        if entry.endswith('.txt'):
            # BUG FIX: the original cleaned only the first line (lines[0])
            # and never closed the file handle.
            with open(path.join(p, entry), "r") as txt_file:
                mergedLines += re.sub(r"[^\w]", " ", txt_file.read()).split()
        elif entry.endswith('.docx'):
            with open(path.join(p, entry), "rb") as docx_file:
                text = mammoth.extract_raw_text(docx_file).value
                mergedLines += re.sub(r"[^\w]", " ", text).split()
        elif entry.endswith('.pdf'):
            # BUG FIX: the original never closed the PDF and reused the
            # outer loop variable ``i`` as the page index, shadowing the
            # file name.
            with open(path.join(p, entry), 'rb') as pdf_file:
                pdfReader = PyPDF2.PdfFileReader(pdf_file)
                for page in range(0, pdfReader.numPages):
                    text = pdfReader.getPage(page).extractText()
                    mergedLines += re.sub(r"[^\w]", " ", text).split()

    # Strip punctuation, lower-case, then drop stop words (original order).
    mergedLines = [
        ''.join(c for c in s if c not in string.punctuation)
        for s in mergedLines
    ]
    mergedLines = [w.lower() for w in mergedLines]
    return [w for w in mergedLines if w not in stop]
def can_extract_raw_text():
    # Raw-text extraction should drop all formatting and separate the two
    # list items with blank lines.
    with open(test_path("simple-list.docx"), "rb") as fileobj:
        extraction = mammoth.extract_raw_text(fileobj=fileobj)
        assert_equal([], extraction.messages)
        assert_equal("Apple\n\nBanana\n\n", extraction.value)
def can_extract_raw_text():
    """Extracting raw text from a simple list yields plain paragraphs only."""
    with open(test_path("simple-list.docx"), "rb") as docx:
        outcome = mammoth.extract_raw_text(fileobj=docx)
        assert_equal([], outcome.messages)
        assert_equal("Apple\n\nBanana\n\n", outcome.value)
 def set_text(self):
     """Return the raw text mammoth extracts from ``self.file``."""
     extraction = mammoth.extract_raw_text(self.file)
     return extraction.value
Example #15
0
# In[ ]:

# Notebook-export cell: inspect the type of ``entries``, which is defined
# in an earlier cell not shown here — TODO confirm against the full export.
type(entries)

# In[ ]:

import glob
import errno
import mammoth
import pandas as pd

# Collect the raw text of every .docx resume under the RESUMES folder.
profiles = []
path = r'C:\Users\abdul\Downloads\RESUMES\*.docx'
files = glob.glob(path)
for name in files:
    # BUG FIX: the original opened each document without ever closing it,
    # leaking one file handle per resume; "with" closes each handle.
    with open(name, 'rb') as document:
        profiles.append(mammoth.extract_raw_text(document).value)

# In[ ]:

# Notebook echo cell: display the list of matched resume paths.
files

# In[ ]:

# Notebook echo cell: number of resumes whose text was extracted.
len(profiles)

# In[ ]:

all_Nigeria_uni = {
    'air force institute of technology', 'alex ekwueme university',
    'federal university gashua', 'federal university dutse',
    'federal university gusau ', 'federal university kashere',