Example #1
def convert_to_txt(file_path):
    logger.debug("convert_to_txt: %s" % file_path)
    words = None
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s",file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        try:
            encoding, file_handle, words = open_with_correct_encoding(file_path)
        except Exception as e:
            logger.error("Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logging.warning("Couldn't find an extension on the file, so assuming text")
        with codecs.open(file_path, 'r', ENCODING_UTF_8) as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
Example #2
def upload(request):
	# user uploads a document -> convert into a dict of the terms found
	if request.FILES:
		if 'file' in request.FILES:
			result = ''
			f = request.FILES['file']
			fp = 'shake_v3/static/data/' + str(f)
			fp2 = fp[:len(fp)-3] + 'txt'
			if fp[len(fp)-3:len(fp)] == 'pdf':
				with open(fp, 'wb+') as pdff:
					for chunk in f.chunks():
						pdff.write(chunk)
				result = pdf_to_txt(fp)
				with open(fp2, 'wb+') as txtf:
					txtf.write(result)			
			elif fp[len(fp)-3:len(fp)] == 'rtf':
				with open(fp, 'wb+') as rtff:
					for line in f:
						rtff.write(line)
				doc = Rtf15Reader.read(open(fp, 'rb'))
				doctxt = PlaintextWriter.write(doc).getvalue()
				with open(fp2, 'wb+') as txtf:
					for line in doctxt:
						txtf.write(line)
				f = str(f)[:-4] + ".txt"
				result = doctxt
			else:
				with open(fp2, 'wb+') as txtf:
					for line in f:
						txtf.write(line)
				result = open(fp2, 'r').read()
		response_dict = generate_term_dict(result)
		response_dict['fp'] = 'static/data/' + str(f)
		return HttpResponse(simplejson.dumps(response_dict), mimetype='application/javascript')
	# user indicates terms -> give a grade
	elif request.POST:
		#TO DO: implement saving the data
		rating = ""
		score = custom_POST_to_score(request)
		if score > 4.5:
			rating = 'A+'
		elif score > 4:
			rating = 'A'
		elif score > 3.5:
			rating = 'B+'
		elif score > 3:
			rating = 'B'
		elif score > 2.5:
			rating = 'C+'
		elif score > 2:
			rating = 'C'
		elif score > 1:
			rating = 'D'
		else:
			rating = 'F'
		return HttpResponse(rating)
	# display the upload part 1
	else:
		score = 0
		return render_to_response('upload.html', {'score': score}, context_instance = RequestContext(request))
Example #3
def rtf_to_text(value):
    if len(value) == 0:
        return value
    rtf_doc = Rtf15Reader.read(BytesIO(value.encode("latin_1")))
    txt_doc = BytesIO()
    PlaintextWriter.write(rtf_doc, txt_doc, encoding="latin_1")
    return txt_doc.getvalue().decode("latin_1")
Example #4
def convertRtfToText(path):
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter

    doc = Rtf15Reader.read(open(path))

    print(PlaintextWriter.write(doc).getvalue())
Example #5
def read_rtf_text(fp, errors='strict', encoding='utf-8'):
    doc = CustomRtf15Reader.read(fp, errors=errors)

    for p in doc.content:
        p.content = filter(paragraph_is_text_like, p.content)

    return PlaintextWriter.write(doc).read().decode(encoding)
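The read_rtf_text example above filters each paragraph's content through a paragraph_is_text_like predicate before writing; that helper (and CustomRtf15Reader) lives elsewhere in the originating project and is not shown. A plausible sketch of such a predicate, offered only as an assumption, keeps just plain pyth Text nodes:

from pyth import document as pyth_document

def paragraph_is_text_like(node):
    # hypothetical stand-in: keep plain text runs, drop anything else in the paragraph
    return isinstance(node, pyth_document.Text)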
Example #7
def convert_to_txt(file_path):
    logger.debug("convert_to_txt: %s" % file_path)
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        try:
            encoding, file_handle, words = open_with_correct_encoding(
                file_path)
        except Exception:
            logger.error("Wasn't able to read the words from the file %s" %
                         file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logging.warning(
            "Couldn't find an extension on the file, so assuming text")
        with open(file_path, 'r') as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
Example #8
def FileProcessor(document_object):
    transcriptstring = PlaintextWriter.write(document_object).read()
    #
    # Removes non-text information from transcriptstring
    transcriptstring = re.sub(r'\b\w{50,}\b', '', transcriptstring)
    # print len(transcriptstring)
    #
    # Removes trailing spaces for each line in transcriptstring
    transcriptstring =\
        ''.join([line.rstrip()+'\n' for line in transcriptstring.splitlines()])
    #
    # Removes disclaimer at endcut
    callendmarker = '\n\n\n\n\n\n\n\n\n\n\n\n\n'
    endcut = transcriptstring.find(callendmarker)
    if endcut != -1:
        transcriptstring = transcriptstring[:endcut]
    #
    # Strips out certain escape characters that don't seem to get handled above
    # Below doesn't work properly because it also deletes all "s" characters
    # escchars = '\xc2\xa9\xe2\x80\x99s'
    # replace_esc = string.maketrans(escchars,
    #                                ' '*len(escchars))
    # transcriptstring = transcriptstring.translate(replace_esc)
    #
    #
    # Decodes strings
    transcriptstring = transcriptstring.decode('utf-8', 'replace')
    transcriptnopunctuation = deletepunctuation(transcriptstring)
    # transcriptnopunctuation =\
    #     transcriptnopunctuation.decode('utf-8', 'replace')
    # print len(transcriptnopunctuation)
    return transcriptstring, transcriptnopunctuation
Example #9
 def read_recommendations(self, file_name):
     """
     Function reads the targeted values from the file "WHO Daily Recommended Values.rtf"
     It processes the entries and creates a dictionary with
     Nutrient name as Key and Nutrient Value as value
     :param file_name:
     :return:
     """
     target = dict()
     filtered_col = list()
     doc = Rtf15Reader.read(open(file_name))
     entities = PlaintextWriter.write(doc).getvalue().split('\n\n')
     for item in entities:
         splited = item.split(',')
         name = splited[0].split('(')[0]
         value = splited[1]
         try:
             unit = splited[0].split('(')[1].split(')')[0]
         except:
             unit = ''
         # target.append({'nutrient': name,
         # 'unit': unit,
         # 'value': value})
         target.update({name: value})
         filtered_col.append(name)
     self.target_values = target
     return target, filtered_col
Example #10
def GetExternal(version, odl_data, source, class_id):
    external = ""

    for item in version[2]:
        if item[0] == "Attribute" \
                and item[1] == "_Art1_RTF":

            if len(item[2]) == 2:
                if isinstance(source, ZipFile):
                    data = source.open(item[2][0]).read()
                else:
                    file_name = join(source, item[2][0])
                    f = open(file_name, 'rb')
                    data = f.read()
                    f.close()
                data = data.replace("\x0c", "")
            elif len(item[2]) == 1:
                data = item[2][0]

            if data == "":
                return ""

            f = StringIO()
            f.write(data)
            doc = Rtf15Reader.read(f, clean_paragraphs = False)
            external = PlaintextWriter.write(doc).getvalue()
            external = external.replace("\n\n", "\n")

    return ReplaceTextNames(external, version, odl_data, class_id)
Example #11
def analyze(committeeFile):
    
    try:
        doc = Rtf15Reader.read(open(committeeFile, "rb"))
    except:
        print "%s - skipped..." % committeeFile
        errFile = committeeFile.replace(global_options.indir, global_options.errdir)
        shutil.copyfile(committeeFile, errFile)
        return False

    #print PlaintextWriter.write(doc).getValue()

    f = open("test.out", 'w')
    f.write(PlaintextWriter.write(doc).getvalue())
    f.close()

    f = open("test.out", 'r')
    participants = find_participants(f.read())
    f.close()

    # Getting the indication whether the participant spoke in the committee
    f = open("test.out", 'r')
    docstring = f.read()
    for line in docstring.splitlines():
        name = ''
        if ":" in line:
            participant = line.split(":")[0]
            for p in participants:
                if participant in p['name']:
                    p['speaker'] = True
                    p['speak_count'] += 1

    f.close()

    fname = committeeFile.replace(global_options.indir, global_options.outdir)
    fname = fname.replace("rtf", "txt")
    file = codecs.open(fname, "w", "utf-8")

    for participant in participants:
        string_builder = []
        for key, val in participant.iteritems():
            string = u"'%s': '%s'"
            if val is not None:
                if type(val) == str:
                    val = val.replace("'", "")
                    val = val.replace('"', '')
                string = string % (key, print_unicode(val))
                string_builder.append(string)
        wrt_ln = ', '.join(string_builder)
        wrt_ln += ',\n'
        try:
            file.write(wrt_ln)

        except UnicodeEncodeError:
            print wrt_ln

    file.close()
    verbose("Generated participants file: " + fname)
    return True
Example #12
def load_stickies(path):
    stickies = []
    with open(path) as fd:
        for i,rtf in enumerate(parse_sticky_database(fd.read())):
            doc = Rtf15Reader.read(StringIO.StringIO(rtf))
            plaintext = PlaintextWriter.write(doc).getvalue()
            stickies.append(plaintext)
    return stickies
Example #13
def main():
    if len(sys.argv) < 2:
        print("usage %s <rtf_file_name> <txt_file_name>")
    else:
        doc = Rtf15Reader.read(open(os.path.join(sys.argv[1])))
        txt_filename = sys.argv[2]
        with open(os.path.join(txt_filename), "w") as of:
            of.write(PlaintextWriter.write(doc).getvalue())
Example #14
def extract_terms(rtffile):
    """ Get data from rtffile """
    judges_list = []
    rtf_text = PlaintextWriter.write(rtffile).getvalue()
    lines = re.split('\n',rtf_text)
    for line in itertools.islice(lines, 0, None, 4):  # 0: start from the first line,
        judges_list.append(line)                      # None: to the end,
    return judges_list                                # 4: step
Example #15
 def readRtf(self, path):
     try:
         doc = Rtf15Reader.read(open(path, "rb"))
     except:
         self._log("Some screwy rtf shit going on with " + path)
         return "Can't process ur shitty rtf <3 dfbot"
     contents = PlaintextWriter.write(doc).getvalue()
     #print contents
     return contents
Example #16
def get_rtf_text(path):
	"""
	Take the path of an rtf file as an argument and return the text
	"""
	doc = Rtf15Reader.read(open(path))
	return PlaintextWriter.write(doc).getvalue()
Example #17
def extract_terms(rtffile):
    """ Get data from rtffile """
    judges_list = []
    rtf_text = PlaintextWriter.write(rtffile).getvalue()
    lines = re.split('\n', rtf_text)
    for line in itertools.islice(lines, 0, None,
                                 4):  # 0: start from the first line,
        judges_list.append(line)  # None: to the end,
    return judges_list  # 4: step
Example #19
	def parse(self, path):
		# Directory
		if os.path.isdir(path):
			raise NotImplementedError()
		# File
		else:
			doc = Rtf15Reader.read(open(path))
			sample = Sample(path, None, PlaintextWriter.write(doc).getvalue())
			return sample
Example #20
    def test_read2(self):
        rtf = StringIO("""{\\rtf1\\ansi\\ansicpg1252\\cocoartf1343\\cocoasubrtf160\\cocoascreenfonts1{\\fonttbl\\f0\\fnil\\fcharset222 Thonburi;}
{\\colortbl;\\red255\\green255\\blue255;}
\\pard\\tx560\\tx1120\\tx1680\\tx2240\\tx2800\\tx3360\\tx3920\\tx4480\\tx5040\\tx5600\\tx6160\\tx6720\\pardirnatural\\qc

{\\f0\\fs24 \\cf0 \\'b9\\'e9\\'d3\\'b5\\'a1}""")
        doc = Rtf15Reader.read(rtf)
        text = PlaintextWriter.write(doc).read()
        print text
        self.assertEquals(u"น้ำตก", text.decode('utf8'))
Example #21
def rtf_to_plain_text(file_name):
    print file_name
    out_file_name = './PlainText/%s.txt' % (file_name[:-4])
    fw = open(out_file_name, 'w')

    doc = Rtf15Reader.read(open(file_name, "r"))

    res = PlaintextWriter.write(doc).getvalue()
    fw.write(res)
    fw.close()
Example #22
def clean_rtf(fname):
    doc = Rtf15Reader.read(open(fname))
    plain = PlaintextWriter.write(doc).getvalue()
    lines = plain.split("\n")
    # print '#############################\norig: %s' % pprint.pformat(lines[:10])
    lines = filter(lambda l: len(l) > 0, lines)
    # print "##############################\nno blank lines:\t%s" % pprint.pformat(lines[:10])
    lines = [line.split(";") for line in lines]
    lines = [[val[1:-1] for val in line] for line in lines]
    # print "##############################\nsplit lines:\t%s" % pprint.pformat(lines[:10])
    return lines
Example #23
 def get_text(self):
     """
     return a unicode object from the rtf file
     """
     loc = self.get_file_loc()
     if loc:
         doc = Rtf15Reader.read(open(loc, "rb"))
         txt = PlaintextWriter.write(doc).getvalue()
         return txt.decode('utf-8')
     else:
         return u""
Example #24
def _rtf_to_txt(file_path, dst_dir, file_name):
    """
    Uses the pyth python module to extract text from a rtf file and save
    to .txt in dst_dir.
    """
    if file_name is None:
        file_name = os.path.split(file_path)[1]
    file_dst = os.path.join(dst_dir, re.sub(r'\.rtf$', '.txt', file_name))
    doc = Rtf15Reader.read(open(file_path))
    txt = PlaintextWriter.write(doc).getvalue()
    txt = unidecode(txt)
    with open(file_dst, 'w') as f:
        f.write(txt)
    return 0
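A short usage sketch for _rtf_to_txt above, with hypothetical paths; passing file_name=None makes the function reuse the source file's base name:

# writes /data/text/report.txt (paths are hypothetical) and returns 0
_rtf_to_txt("/data/incoming/report.rtf", "/data/text", None)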
Example #26
def getFileText(file_path, html=False, pdf_utf8=False):
    '''
    input: string of file path
    output: either raw string or parsed html text content
    '''
    file_extension = os.path.splitext(file_path)[1]
    if file_extension.lower() != ".py":
        if file_extension.lower() == ".html" or file_extension.lower(
        ) == '.htm':
            file_content = open(file_path).read()
            if html:
                try:
                    html_text = lh.fromstring(file_content).text_content()
                    return html_text
                except UnicodeDecodeError:
                    try:
                        html_text = lh.fromstring(
                            helpers.convert_encoding(
                                file_content)).text_content()
                    except UnicodeDecodeError:
                        html_text = lh.fromstring(
                            unicode(file_content,
                                    errors='ignore')).text_content()
                        return html_text
                    return html_text
            else:
                return file_content
        if file_extension == ".pdf":
            pdf_content = open(file_path, "rb")
            pdfReader = PyPDF2.PdfFileReader(pdf_content)
            num_pages = pdfReader.getNumPages()
            page_text = ""
            for i in range(0, num_pages):
                pageObj = pdfReader.getPage(i)
                page_text = page_text + " " + pageObj.extractText()
            # Need to check for pdfs that are just scanned images
            if len(page_text) <= num_pages:
                return None
            else:
                if pdf_utf8:
                    return page_text.encode('utf-8')
                else:
                    return page_text
        if file_extension == ".rtf":
            doc = Rtf15Reader.read(open(file_path))
            page_text = PlaintextWriter.write(doc).getvalue()
            uni_page_text = page_text.decode('utf-8')
            return uni_page_text
    return None
Example #27
    def testmethod(self):  # the test method to be added
        inputfilename = os.path.join(rtfinputsdir, basename+".rtf")
        outputfilename = os.path.join(testoutputdir, 
                                      "%s.%s" % (basename, writer))
        #--- obtain reference output or skip test:
        with open(referencefilename, "rb") as input:
            the_referenceoutput = input.read()
        #--- read and convert RTF:
        with open(inputfilename, "rb") as input:
            document = Rtf15Reader.read(input)
        if writer == 'html':
            the_testoutput = XHTMLWriter.write(document, pretty=True).read()
            write_html_file(outputfilename, the_testoutput, print_msg=False)
        elif writer == 'txt':
            with open(outputfilename, "wt") as f:
                PlaintextWriter.write(document, f)

        #--- compute test output:
        with open(outputfilename, "rb") as input:
            the_testoutput = input.read()
        #--- check outcome:
        if the_testoutput == the_referenceoutput:
            os.remove(outputfilename)  # assert will succeed, so it is no longer needed
        self.assertEqual(the_testoutput, the_referenceoutput)
Example #28
def Run(journal_file):
    raw_entries = plistlib.readPlist(journal_file)

    acc = utils.EntryAccumulator(lambda x: x['date'])
    for k, v in raw_entries.iteritems():
        if not v: continue
        # 12/29/2001 -> 2001-12-29
        new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k)
        d = parser.parse(new_k)

        if isinstance(v, plistlib.Data):
            f = StringIO.StringIO(v.data)
            try:
                doc = Rtf15Reader.read(f)
            except ValueError as e:
                print v.data
                raise e
            txt = PlaintextWriter.write(doc).getvalue()
            acc.add({'date': d, 'rtf': v.data, 'text': txt})
        else:
            acc.add({'date': d, 'text': v})

    for day, entries in acc.iteritems():
        assert len(entries) == 1
        entry = entries[0]

        if not entry['text']:
            continue

        summary = utils.SummarizeText(entry['text'])
        utils.WriteSingleSummary(day,
                                 maker='osxapp',
                                 summary=summary,
                                 dry_run=dry_run)
        if 'rtf' in entry:
            utils.WriteOriginal(day,
                                maker='osxapp',
                                contents=entry['rtf'],
                                filename='journal.rtf',
                                dry_run=dry_run)
        else:
            utils.WriteOriginal(day,
                                maker='osxapp',
                                contents=entry['text'].encode('utf8'),
                                filename='journal.txt',
                                dry_run=dry_run)
Example #29
    def _convert_rtf_to_text(self, password=None):
        input_rtf = self.cvFile
        rtf = Rtf15Reader.read(open(input_rtf))
        outputPath = self.scratchDir
        inputPath = os.getcwd()
        if os.path.exists(input_rtf):
            inputPath = os.path.dirname(input_rtf)
        input_filename = os.path.basename(input_rtf)
        input_parts = input_filename.split(".")
        input_parts.pop()
        randomStr = int(time.time())
        output_filename = outputPath + os.path.sep + ".".join(input_parts) + randomStr.__str__() + r".txt"
        self.cvTextFile = output_filename
        fw = open(self.cvTextFile, "w")
        fw.write(PlaintextWriter.write(rtf).getvalue())
        fw.close()
        return (0)
Example #30
def loadAllRTFToDB(folderPath):
	db = DBController()
	for dirPath, dirNames, fileNames in os.walk(folderPath):
		for fileName in fileNames:
			if not fileName.endswith('.rtf'):
				continue
			filePath = os.path.join(dirPath, fileName)
			print(filePath)
			try:
				doc = Rtf15Reader.read(open(filePath))
				text = PlaintextWriter.write(doc).getvalue()
			except:
				continue
			lines = [line.strip() for line in text.split('\n') if line]
			articleLinesDict, articleStartIndex = {}, 0
			for i, line in enumerate(lines):
				if line.startswith('Document ') and len(line.split(' ')) == 2:
					articleId = line.split(' ')[-1]
					articleLinesDict[articleId] = lines[articleStartIndex : i]
					articleStartIndex = i + 1

			for articleId, lines in articleLinesDict.iteritems():
				bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
				for i, line in enumerate(lines):
					line = line.lower()
					if line.startswith('by '):
						bylineIndex = i
					elif line.endswith(' words'):
						wordCountIndex = i
					elif line == 'english':
						textStartIndex = i + 2

				if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
					print(filePath + ', ' + articleId)
				else:
					articleDict = {'_id': articleId,
					               'filePath' : filePath.split('Marshall_RA/')[-1],
					               'headline': ' '.join(lines[: wordCountIndex]) if bylineIndex == -1 else ' '.join(lines[: bylineIndex]),
					               'byline' : '' if bylineIndex == -1 else lines[bylineIndex],
					               'date' : parser.parse(lines[wordCountIndex + 1]),
					               'sourceName' : lines[wordCountIndex + 2] if lines[wordCountIndex + 2].find(' AM') == -1 and lines[wordCountIndex + 2].find(' PM') == -1 else lines[wordCountIndex + 3],
					               'leadParagraph' : '',
					               'tailParagraph' : '\n'.join(lines[textStartIndex:]),
					               'sourceCode' : '', 'industry' : [], 'region' : [], 'newsSubject' : [], 'company' : []}
					db.saveArticle(articleDict)
Example #31
def documentToText(path):
    if path[-4:] == ".doc":
        cmd = ['antiword', path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return removeNonAscii(stdout)
    elif path[-5:] == ".docx":
        return removeNonAscii(doc.process(path))
    elif path[-4:] == ".txt":
        inputFile = open(path)
        text = inputFile.read() #Because memory and such
        inputFile.close()
        return(removeNonAscii(text))
    elif path[-4:] == ".pdf":
        return removeNonAscii(convert_pdf_to_txt(path))
    elif path[-4:] == ".rtf":
        text = Rtf15Reader.read(open(path))
        return removeNonAscii(PlaintextWriter.write(text).getvalue())
    return "Returned Nothing."
Example #32
def Run(journal_file):
  raw_entries = plistlib.readPlist(journal_file)

  acc = utils.EntryAccumulator(lambda x: x['date'])
  for k, v in raw_entries.iteritems():
    if not v: continue
    # 12/29/2001 -> 2001-12-29
    new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k)
    d = parser.parse(new_k)

    if isinstance(v, plistlib.Data):
      f = StringIO.StringIO(v.data)
      try:
        doc = Rtf15Reader.read(f)
      except ValueError as e:
        print v.data
        raise e
      txt = PlaintextWriter.write(doc).getvalue()
      acc.add({
        'date': d,
        'rtf': v.data,
        'text': txt
      })
    else:
      acc.add({
        'date': d,
        'text': v
      })

  for day, entries in acc.iteritems():
    assert len(entries) == 1
    entry = entries[0]

    if not entry['text']:
      continue

    summary = utils.SummarizeText(entry['text'])
    utils.WriteSingleSummary(day, maker='osxapp', summary=summary, dry_run=dry_run)
    if 'rtf' in entry:
      utils.WriteOriginal(day, maker='osxapp', contents=entry['rtf'], filename='journal.rtf', dry_run=dry_run)
    else:
      utils.WriteOriginal(day, maker='osxapp', contents=entry['text'].encode('utf8'), filename='journal.txt', dry_run=dry_run)
Example #33
def handle_files():
    '''
    The main function to start processing the rtf files
    into csv
    '''
    file_prefix = "old_committee-meetings-protocols"
    for file_name in glob2.glob(protocol_dir):
        if file_prefix in file_name:
            doc = Rtf15Reader.read(open(file_name))
            data = PlaintextWriter.write(doc).getvalue()
            data = data.split(':')
            for leg in data:
                if "מוזמנים" in leg:
                    index_visitor = data.index(leg) + 1
                    name = ({'header': 'מוזמנים', 'body': data[index_visitor]})
                    dir_file = file_name.replace('.rtf', '.csv')
                    with open(dir_file, 'w') as f:
                        w = csv.DictWriter(f, name.keys())
                        w.writeheader()
                        w.writerow(name)
                        break
Example #34
def upload_file(request):
    error_message = ""
    if request.method == "POST":
        form = UploadForm(request.POST, request.FILES)
        if form.is_valid():
            doc_name = UploadedFile(request.FILES["doc_file"])
            doc_uploaded_date = timezone.now()
            doc = request.FILES["doc_file"]

            if get_file_type(doc_name) == ".rtf":
                result = Rtf15Reader.read(doc)
                parser = LawHtmlParser(PlaintextWriter.write(result).read())
            elif get_file_type(doc_name) == ".txt":
                parser = LawHtmlParser(doc.read())
            parsed_doc_content = parser.get_parsed_text()
            new_doc = Document(name=doc_name, content=parsed_doc_content, uploaded_date=doc_uploaded_date, file=doc)
            new_doc.save()
            return HttpResponseRedirect(reverse("document:list"))
        else:
            error_message = "Please select a file."

    form = UploadForm()
    return render(request, "document/upload.html", {"form": form, "error_message": error_message})
Example #35
def parseRTFstring(rtfSTRING):
    doc = Rtf15Reader.read(rtfSTRING)
    #print PlaintextWriter.write(doc).getvalue()
    return PlaintextWriter.write(doc).getvalue()
Example #36
totalInvalidNamesCount = 0
for committeeFile in committeeFiles:
    print "===================="
    print str(progressCount) + " - analyzing " + committeeFile
    progressCount -= 1
    try:
        doc = Rtf15Reader.read(open(committeeFile, "rb"))
    except:
        print committeeFile + " - skipped..."
        errFile = committeeFile.replace(srcDir, "errFiles")
        shutil.copyfile(committeeFile, errFile)
        skippedFilesCount += 1
        continue

    f = open("test.out", "w")
    f.write(PlaintextWriter.write(doc).getvalue())
    f.close()

    f = open("test.out", "r")
    docstring = f.read()

    #    print docstring
    participants = []
    agendas = []
    agendaIndex = 1
    addParticipant = False
    handleAgenda = False
    invalidNamesCount = 0
    digit_sep = [".", " ", "(", ")"]
    for line in docstring.splitlines():
        #    print line
Example #37
def parse():
    committeeIdsPerYear = {}
    for csvFileName in glob.glob(CSV_PATH):
        if not re.match('.+\d{4}\.csv$', csvFileName):
            continue

        year = csvFileName[-8:-4]
        if int(year) > 2010:
            continue

        committeeIdsPerYear[year] = set()
        
        with open(csvFileName) as csvFile:
            for row in csv.reader(csvFile):
                committeeId = row[8]
                if row[0].strip() == year:
                    if row[6] == '2' and committeeId != '0':
                        committeeIdsPerYear[year].add(committeeId)
                    committeeIdsPerYear[year].add("%03d-%02d" % (int(row[2]),int(row[1])))

    requestToMeetingMap = {}
    with open('./log.txt', 'w+') as outputFile:
        rtfFiles = glob.glob(RTF_PATH)
        rtfFiles.sort()
        for fileName in rtfFiles:
            if not fileName.endswith('.rtf'):
                continue

            year = fileName[:4]
            if year not in committeeIdsPerYear:
                continue
            if year not in requestToMeetingMap:
                requestToMeetingMap[year] = {}

            hadAnything = False
            with open(fileName) as rtfFile:
                parsedName = re.findall(u'\d+', fileName)
                date, meetingId = str('/'.join(parsedName[0:3][::-1])), parsedName[-1] if len(parsedName) > 3 else '00'
                try:
                    doc = Rtf15Reader.read(rtfFile)
                except Exception, e:
                    print "failed to parse %s: %s" % (fileName,e)
                    continue

                for line in PlaintextWriter.write(doc):
                    line = unicode(line, encoding='utf-8')

                    if u'אושר' in line and len(line)<95:
                        print fileName+":"+line
                    for _request in REQUEST_RE.findall(line) + APPROVEDLINE_RE.findall(line):
                        request = _request[0]
                        hadAnything = True
                        ids = set()
                        doubles2 = set(DOUBLES_RE2.findall(request))
                        for d in doubles2:
                            p1,p2 = d
                            combination_id = '%s-%s' % (p2,p1)
                            print "D2 "+combination_id
                            if combination_id in committeeIdsPerYear[year]:
                                ids.add(combination_id)
                        request = DOUBLES_RE2.subn('',request)[0]
                        doubles3 = set(DOUBLES_RE3.findall(request))
                        for d in doubles3:
                            p1,p2 = d
                            combination_id = '%s-%s' % (p1,p2)
                            print "D3 "+combination_id
                            if combination_id in committeeIdsPerYear[year]:
                                ids.add(combination_id)
                        request = DOUBLES_RE3.subn('',request)[0]
                        doubles = set(DOUBLES_RE.findall(request))
                        for d in doubles:
                            p1,sep,p2 = d
                            combination_id = '%s-%s' % (p1,p2)
                            if p1.startswith('0') and sep == '-':
                                ids.add(combination_id)
                            else:
                                if combination_id in committeeIdsPerYear[year]:
                                    ids.add(combination_id)
                                else:
                                    p1 = int(p1)
                                    p2 = int(p2)
                                    if abs(p2-p1) < 100:
                                        ids.update(map(str,range(min(p1,p2),max(p1,p2)+1)))
                        request = DOUBLES_RE.subn('',request)[0]
                        singles = SINGLES_RE.findall(request)
                        ids.update(singles)
                            
                        for word in BAD_WORDS:
                            if word in request:
                                ids = set()

                        res = list(ids & committeeIdsPerYear[year])

                        outputFile.write(fileName + ': ' + _request[0].strip().encode('utf-8')+"  " +repr(ids)+"->"+repr(res)+"\n")
                        outputFile.flush()
                                                      
                        for requestId in res:
                            requestToMeetingMap[year][requestId] = [date, meetingId]
            
            if not hadAnything:
                pass #os.rename(fileName,fileName+".empty")
Example #38
def unrtf(value):
	return PlaintextWriter.write(Rtf15Reader.read(StringIO(value))).getvalue()
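The unrtf one-liner above is lifted from a larger module; a self-contained sketch of the imports it presumably relies on (Python 2 style StringIO), plus a hypothetical call:

from StringIO import StringIO
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter

rtf_bytes = open("letter.rtf", "rb").read()   # hypothetical file
print(unrtf(rtf_bytes))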
Example #39
def rtf(from_file, to_txt, opts):
    doc = Rtf15Reader.read(open(from_file.path, "rb"))
    text = PlaintextWriter.write(doc).getvalue()
    return save_raw_data(to_txt.path, text)
Example #40
                ftags = ''
                fmetd = str(meta.to_json(indent=2)).replace("'", "\\'")
                floc = os.path.join(dirname, filename)
                modtime = meta["file_timestamps"]["modified"]

                print "Name : " + filename
                print "Size : " + str(fsize)
                print "Location :" + floc
                print "Extension : " + fext
                print "Type : " + ftype
                print "Modified Time : " + modtime
                print "category : " + fcat

                if fext == "rtf":
                    doc = Rtf15Reader.read(open(floc))
                    text_article = PlaintextWriter.write(doc).getvalue()
                    abc = str(keywords(text_article))
                    abc = abc.replace("'", "\\'")
                    ftags = abc

                if fext == "txt":
                    doc = open(floc).read()
                    abc = str(keywords(doc))
                    abc = abc.replace("'", "\\'")
                    ftags = abc

                query = "INSERT INTO `datainfo`( `name`, `file type`, `extension`, `size`, `category`, `tags`, `metadata`,`location`,`modified_time`) VALUES ('" + filename + "','" + ftype + "','" + fext + "','" + str(
                    fsize
                ) + "','" + fcat + "','" + ftags + "','" + fmetd + "','" + floc + "','" + modtime + "')"

                x.execute(query)
Example #41
def parse():
    committeeIds = {}
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue

            year = csvFileName[7:11]
            committeeIds[year] = set()

            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                for row in csv.reader(csvFile):
                    committeeId = row[8]
                    if row[6] == '2' and committeeId != '0':
                        committeeIds[year].add(committeeId)
    dictionary = {}
    with open('./log.txt', 'w+') as outputFile:
        for rtfDir, _, rtfFiles in os.walk('./rtf'):
            for fileName in rtfFiles:
                if not fileName.endswith('.rtf'):
                    continue

                year = fileName[:4]
                if year not in dictionary:
                    dictionary[year] = {}

                with open(os.path.join(rtfDir, fileName)) as rtfFile:
                    parsedName = re.findall(u'\d+', fileName)
                    date, meetingId = str(
                        '/'.join(parsedName[0:3][::-1])
                    ), parsedName[-1] if len(parsedName) > 3 else '00'
                    try:
                        doc = Rtf15Reader.read(rtfFile)
                    except Exception:
                        continue

                    for line in PlaintextWriter.write(doc):
                        line = unicode(line, encoding='utf-8')
                        if len(line) < 95 and re.match(approvedLine, line):
                            res = list(
                                set(re.findall(u'\d+', line))
                                & committeeIds[year])
                            if len(res) > 0:
                                outputFile.write(fileName + ': ' +
                                                 line.encode('utf-8'))
                                for requestId in res:
                                    dictionary[year][requestId] = [
                                        date, meetingId
                                    ]

    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue

            year = csvFileName[7:11]
            if not year in dictionary:
                continue

            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                with open(
                        os.path.join(
                            csvDir,
                            csvFileName[:-4] + '_out' + csvFileName[-4:]),
                        'w+') as outputCsv:
                    writer = csv.writer(outputCsv)
                    reader = csv.reader(csvFile)
                    writer.writerow(reader.next() + headerColumns)
                    for row in csv.reader(csvFile):
                        committeeId = '' if len(row) < 9 else row[8]
                        writer.writerow(row + (
                            ['', ''] if committeeId not in dictionary[year]
                            else dictionary[year][committeeId]))
Example #42
def document_to_text(file_path):
    """
    Convert document file (.pdf, .doc, .docx, .odt, .rtf) into plain text.

    * Additional dependencies: the 'antiword' and 'odt2txt' commands are required to run this function.
    * Converting a pdf file takes much more time than the other formats.

    :rtype : string
    :param file_path:
    :return: text-converted version of document file contents
    """
    dir_name, file_name = os.path.split(file_path)

    def convert_pdf_to_txt(path):
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)

        fp.close()
        device.close()
        str = retstr.getvalue()
        retstr.close()
        return str

    if file_name[-4:] == ".doc":
        cmd = ['antiword', file_path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        if len(stdout) > 0:
            return stdout.decode('ascii', 'ignore')
        else:
            # try .rtf format when it's not .doc file
            try:
                doc = Rtf15Reader.read(open(file_path))
                return PlaintextWriter.write(doc).getvalue()
            except:
                pass
    elif file_name[-5:] == ".docx":
        document = opendocx(file_path)
        paratextlist = getdocumenttext(document)
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        return '\n\n'.join(newparatextlist)
    elif file_name[-4:] == ".odt":
        cmd = ['odt2txt', file_path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return stdout.decode('ascii', 'ignore')
    elif file_name[-4:] == ".pdf":
        return convert_pdf_to_txt(file_path)
    elif file_name[-4:] == ".rtf":
        doc = Rtf15Reader.read(open(file_path))
        return PlaintextWriter.write(doc).getvalue()
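A brief usage sketch for document_to_text above (the path is hypothetical); for .doc and .odt inputs the antiword and odt2txt commands named in the docstring must be on PATH:

text = document_to_text("/tmp/minutes.rtf")   # hypothetical path
if text:
    print(text[:200])   # first 200 characters of the extracted plain text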
Example #43
def readfile(file):
    try:
        if file.startswith('https://') or file.startswith(
                'http://') or file.startswith('ftp://'):
            data = BytesIO(download(file))
        else:
            data = open(file, 'rb')

        if file.endswith('.caj') or file.endswith('.pdf'):
            with StringIO() as outfp:
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, outfp)
                process_pdf(rsrcmgr, device, data)
                return outfp.getvalue()
        elif file.endswith('.doc'):
            text = ''
            document = olefile.OleFileIO(data)

            wordDocument = document.openstream('WordDocument').read()

            # Parsing the WordDocument Stream
            # See https://msdn.microsoft.com/en-us/library/office/dd904907(v=office.14).aspx
            # And http://b2xtranslator.sourceforge.net/howtos/How_to_retrieve_text_from_a_binary_doc_file.pdf

            # Loading the FIB
            fib = wordDocument[:1472]

            # Loading and Parsing the piece table
            fcClx = int.from_bytes(fib[0x01A2:0x01A5], byteorder='little')
            lcbClx = int.from_bytes(fib[0x01A6:0x01A9], byteorder='little')

            tableFlag = ((int.from_bytes(
                fib[0x000A:0x000E], byteorder='little') & 0x0200) == 0x0200)
            tableName = ('0Table', '1Table')[tableFlag]

            table = document.openstream(tableName).read()

            clx = table[fcClx:fcClx + lcbClx]

            pos = 0
            pieceTable = ''
            lcbPieceTable = 0
            while True:
                if clx[pos] == 2:
                    # this entry is the piece table
                    lcbPieceTable = int.from_bytes(clx[pos + 1:pos + 5],
                                                   byteorder='little')
                    pieceTable = clx[pos + 5:pos + 5 + lcbPieceTable]
                    break
                elif clx[pos] == 1:
                    # skip this entry
                    pos = pos + 1 + 1 + ord(clx[pos + 1])
                else:
                    break

            i = 1
            pieceCount = (lcbPieceTable - 4) / 12
            while i <= pieceCount:
                cpStart = int.from_bytes(pieceTable[i * 4:i * 4 + 4],
                                         byteorder='little')
                cpEnd = int.from_bytes(pieceTable[(i + 1) * 4:(i + 1) * 4 + 4],
                                       byteorder='little')

                offsetPieceDescriptor = int(((pieceCount + 1) * 4) + (i * 8))
                pieceDescriptor = pieceTable[
                    offsetPieceDescriptor:offsetPieceDescriptor + 8]

                fcValue = int.from_bytes(pieceDescriptor[2:6],
                                         byteorder='little')
                isANSII = (fcValue & 0x40000000) == 0x40000000
                fc = fcValue & 0xBFFFFFFF

                encoding = ('utf-16', 'cp1252')[isANSII]
                cb = cpEnd - cpStart
                cb = (cb * 2, cb)[isANSII]
                text += wordDocument[fc:fc + cb].decode(encoding)

                i += 1

            return text
        elif file.endswith('.docx'):
            text = ''
            document = Document(data)

            text += '\n\n'.join(
                [paragraph.text for paragraph in document.paragraphs])

            for table in document.tables:
                text += _parse_docx_table(table, text)

            return text
        elif file.endswith('.htm') or file.endswith('.html'):
            html = html2text.HTML2Text()
            html.ignore_links = True
            return html.handle(data.read().decode('utf-8'))
        elif file.endswith('.rtf'):
            with BytesIO() as outfp:
                document = Rtf15Reader.read(data)
                return PlaintextWriter.write(document, outfp).getvalue()
        elif file.endswith('.txt'):
            return data.read()
        else:
            raise Exception('Unknown file extension')
    except:
        pass
Example #44
def convert(path, f, target=None):
    import re  # for regular expressions (google it ;))
    from collections import OrderedDict  # otherwise Verse1, Chorus etc. would always end up sorted alphabetically
    from pandas import read_csv, Index
    # all imports for the PDF generation (must be run before the next cell that uses pyth)
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.units import inch
    from reportlab.pdfbase.pdfmetrics import stringWidth
    from reportlab.lib import styles, colors
    from reportlab.platypus import Paragraph
    # RTF reading package
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    # read in the RTF
    doc = Rtf15Reader.read(f)
    raw = PlaintextWriter.write(doc).getvalue()
    pattern = "(^[\xc3\x9f\xc3\x84\xc3\x96\xc3\x9c\xc3\xa4\xc3\xbc\xc3\xb6\xe2\x80\x99,\w\s]+\n+)(key:[\w#]+\n+)?(bpm:[\d]+\n+)?(.+)(CCLI Song # (\d+)\\n+(.+)\\n+\\xc2\\xa9 (.+))"
    match = re.search(pattern, raw, re.DOTALL)
    info_dict = {}
    info_dict['title'] = match.group(1).replace('\n', '')
    if match.group(2):
        info_dict['key'] = match.group(2).replace('\n', '').replace('key:', '')
    else:
        print "No key found"
    if match.group(3):
        info_dict['bpm'] = match.group(3).replace('\n', '').replace('bpm:', '')
    else:
        print "No bpm found"
    info_dict['song'] = match.group(4)
    info_dict['ccli_nr'] = match.group(6)
    info_dict['composer'] = match.group(7).replace('\n', '')
    info_dict['copyright'] = match.group(8)
    akkorde = read_csv("Akkorde.csv", sep=";")

    def getTransformedKey(source, target, chord):
        return (akkorde[target][Index(akkorde[source]).get_loc(chord)])

    def replChords(matchObj):
        return ('[' + getTransformedKey(
            source=info_dict['key'], target=target, chord=matchObj.group(1)) +
                ']')

    def transform():
        info_dict['song'] = re.sub('\[([\w\d#/]+)\]', replChords,
                                   info_dict['song'])
        info_dict['key'] = target

    #target = request.form['trans_key']
    if (target and target != info_dict['key']):
        transform()
    # load the individual lines from the RTF into a list
    line_list = info_dict.get('song').split('\n\n')
    line_list
    pattern = '^(Verse\s?\d*|Chorus\s?\d*|Instrumental|Bridge|Pre-Chorus|Intro)$'  # this pattern matches every VerseX and Chorus section heading (try it on regexr.com)
    song_dict = OrderedDict()  # the ordered dict mentioned above
    in_element = False  # this flag could later also be used to pull out title:, composer:, key: etc. (haven't gotten that far yet)
    element = None  # stores which subsection we are currently in
    for i in range(len(line_list)):
        if in_element:  # while we are inside an element, every following line is appended to its entry
            if not re.search(pattern, line_list[i]):
                song_dict[element].extend([line_list[i]])
        match = re.search(
            pattern, line_list[i]
        )  # until the first match (e.g. VerseX or Chorus) there is no element yet
        if match:  # once we have a match we are inside an element; create it as a new dictionary entry
            in_element = True
            element = match.group(1)
            song_dict[element] = [
            ]  # this dictionary entry holds a new list of lines

    def createPDF(fontSize=13):
        width, height = A4  #keep for later
        font = 'Helvetica'
        lineHeight = fontSize + .75 * fontSize
        wordSpace = 3
        boarder = inch
        topBoarder = .75 * boarder
        instrSpaces = 5
        chordstyle = styles.ParagraphStyle('chord')
        chordstyle.fontSize = fontSize
        hstyle = styles.ParagraphStyle('heading')
        hstyle.fontSize = fontSize + 1
        tstyle = styles.ParagraphStyle('title')
        tstyle.fontSize = fontSize + 5
        copyrightstyle = styles.ParagraphStyle('copyright')
        copyrightstyle.fontSize = 8

        pattern = '\[([\w\d#/]+)\]'
        y = height - topBoarder - fontSize
        x = boarder
        realWidth = width - 2 * boarder
        c = canvas.Canvas(path + info_dict['title'] + '-' + info_dict['key'] +
                          '.pdf',
                          pagesize=A4)
        c.setFont(font, fontSize - 1)

        P1 = Paragraph("<u><b>" + info_dict['title'] + "</b></u>", tstyle)
        P1.wrap(realWidth, height)
        P1.drawOn(c, x, y)

        if info_dict.has_key('key'):
            P1 = Paragraph("<b>" + info_dict['key'] + "</b>", chordstyle)
            P1.wrap(realWidth, height)
            P1.drawOn(
                c, width - boarder -
                stringWidth(info_dict['key'], font, chordstyle.fontSize), y)
        if info_dict.has_key('bpm'):
            c.drawRightString(width - boarder, y - lineHeight,
                              '%s' % info_dict['bpm'])
        P1 = Paragraph(info_dict['composer'], copyrightstyle)
        P1.wrap(realWidth, height)
        P1.drawOn(c, x, y - lineHeight)

        c.setFont(font, fontSize)
        y -= hstyle.fontSize + 2 * lineHeight

        for key in song_dict:
            P1 = Paragraph("<b><i>" + key + "</i></b>", hstyle)
            P1.wrap(realWidth, height)
            P1.drawOn(c, x, y)
            xOfLast = boarder
            lineCount = 0
            if re.search(pattern, song_dict.get(key)[0]):
                y -= 1.8 * (
                    lineHeight
                )  # spacing from the heading to the first line when there are chords
            else:
                y -= 1.2 * (
                    lineHeight
                )  # spacing from the heading to the first line when there are no chords
            if (key in ["Instrumental", "Intro"]):
                for line in song_dict.get(key):
                    line = line.replace('[', '').replace(']', '').replace(
                        ' ', '&nbsp;' * (instrSpaces))
                    P1 = Paragraph("<b>" + line + "</b>", chordstyle)
                    P1.wrap(realWidth, height)
                    P1.drawOn(c, x, y)
                    y -= 1.5 * lineHeight  # spacing after each section
            else:
                for line in song_dict.get(key):
                    if ((xOfLast + stringWidth(line, font, fontSize)) <
                        (width - boarder)) and (lineCount < 2):
                        x = xOfLast
                        lineCount += 1
                    elif not re.search(pattern, line):
                        y -= 1 * lineHeight
                    else:
                        y -= 1.5 * lineHeight
                        lineCount = 1
                    line = line.decode('utf-8')
                    last_was_chord = False
                    x_min = 0
                    cursor = 0
                    while cursor < len(line):
                        l = line[cursor]
                        if l == ' ':
                            if last_was_chord:
                                x += last_cord_length
                                last_was_chord = False
                            else:
                                x += wordSpace
                        elif l == '[':
                            end = line.find(']', cursor)
                            chord = line[cursor + 1:end]
                            P1 = Paragraph("<b>" + chord + "</b>", chordstyle)
                            P1.wrap(realWidth, height)
                            if x < x_min:
                                x = x_min
                            P1.drawOn(c, x, y + fontSize + 0.01 * fontSize**2)
                            cursor = end
                            last_was_chord = True
                            last_cord_length = stringWidth(
                                chord, font, fontSize)
                            x_min = x + last_cord_length + wordSpace * 7
                        else:
                            last_was_chord = False
                            c.drawString(x, y, l)
                            x += stringWidth(l, font, fontSize)
                        cursor += 1
                    xOfLast = x + wordSpace
                    x = boarder
                y -= 1.5 * lineHeight  # spacing after each section

        P1 = Paragraph(
            ('© ') + info_dict['copyright'] +
            '<br/>Gebrauch nur zur Nutzung im Rahmen von Veranstaltungen der City Chapel Stuttgart',
            copyrightstyle)
        P1.wrap(realWidth, height)
        P1.drawOn(c, x, boarder - P1.height)  # + lineHeight)

        c.showPage()
        c.save()
        return (y < boarder)

    nochmal = True
    fontSize = 13
    while (nochmal):
        nochmal = createPDF(fontSize)
        fontSize -= .5
Example #45
def extractstringfromRTF(path, filename):
    document_object = Rtf15Reader.read(open(path + filename, "rb"))
    transcriptstring = PlaintextWriter.write(document_object).read()
    return transcriptstring
Example #46
def parse():
    committeeIds = {}
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue

            year = csvFileName[7:11]
            committeeIds[year] = set()

            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                for row in csv.reader(csvFile):
                    committeeId = row[8]
                    if row[6] == '2' and committeeId != '0':
                        committeeIds[year].add(committeeId)
    dictionary = {}
    with open('./log.txt', 'w+') as outputFile:
        for rtfDir, _, rtfFiles in os.walk('./rtf'):
            for fileName in rtfFiles:
                if not fileName.endswith('.rtf'):
                    continue

                year = fileName[:4]
                if year not in dictionary:
                    dictionary[year] = {}

                with open(os.path.join(rtfDir, fileName)) as rtfFile:
                    parsedName = re.findall(u'\d+', fileName)
                    date, meetingId = str('/'.join(parsedName[0:3][::-1])), parsedName[-1] if len(parsedName) > 3 else '00'
                    try:
                        doc = Rtf15Reader.read(rtfFile)
                    except Exception:
                        continue

                    for line in PlaintextWriter.write(doc):
                        line = unicode(line, encoding='utf-8')
                        if len(line) < 95 and re.match(approvedLine, line):
                            res = list(set(re.findall(u'\d+', line)) & committeeIds[year])
                            if len(res) > 0:
                                outputFile.write(fileName + ': ' + line.encode('utf-8'))
                                for requestId in res:
                                    dictionary[year][requestId] = [date, meetingId]

    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue

            year = csvFileName[7:11]
            if not year in dictionary:
                continue

            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                with open(os.path.join(csvDir, csvFileName[:-4] + '_out' + csvFileName[-4:]), 'w+') as outputCsv:
                    writer = csv.writer(outputCsv)
                    reader = csv.reader(csvFile)
                    writer.writerow(reader.next() + headerColumns)
                    for row in csv.reader(csvFile):
                        committeeId = '' if len(row) < 9 else row[8]
                        writer.writerow(
                            row + (['', ''] if committeeId not in dictionary[year] else dictionary[year][committeeId]))
Example #47
 def extract_content(self, filepath):
     with open(filepath, "rb") as fh:
         doc = Rtf15Reader.read(fh)
     
     return to_utf8(PlaintextWriter.write(doc).getvalue())
Example #48
def convertRtfToText(path):
	doc = Rtf15Reader.read(open(path))
	return PlaintextWriter.write(doc).getvalue()
Example #49
# import sys
# sys.path.append('F:\陶士来文件\downloads\PyRTF-0.45\PyRTF-0.45')
# # from PyRTF import *
# with open('test.rtf','rb') as f:
#     for each in f:
#         print(each)
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter
import re
with open('test2.rtf', 'rb') as f:
    # for each in f :
    #     each=re.sub('\n','',each)
    doc = Rtf15Reader.read(f, clean_paragraphs=True)
    print(doc)
    print(PlaintextWriter.write(doc, newline="\n").getvalue(), end="")
Example #50
0
def convertRtfToText(path):
    with open(path, "rb") as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    return PlaintextWriter.write(doc).getvalue()
Example #51
0
    def extract_content(self, filepath):
        with open(filepath, "rb") as fh:
            doc = Rtf15Reader.read(fh)

        return to_utf8(PlaintextWriter.write(doc).getvalue())
Example #52
0
def main():
    y = 0
    contains = False
    for a in range(1, 7):
        list = []
        dev = '{}{}'.format("dev", a)
        root = api.path + '/{}/interviews/'.format(dev)
        dirlist = [
            item for item in os.listdir(root)
            if os.path.isfile(os.path.join(root, item))
        ]

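        # Build a list of (interview date, clip transcript) pairs from the .rtf transcripts.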
        for i in dirlist:
            if i.endswith('.rtf'):
                x = 0
                print root + i
                with open(root + i, 'rb') as rtf_file:
                    doc = PlaintextWriter.write(Rtf15Reader.read(rtf_file)).getvalue()
                first_index = doc.find("Clip Transcript") + 21
                second_index = doc.find("Clip Keywords") - 19
                interview_date = doc[first_index - 35:first_index - 25].replace("_", "-")
                clip_transcript = doc[first_index:second_index]
                list.append([interview_date, clip_transcript])

                while x < doc.count("Clip Transcript"):

                    if doc.find("Clip Transcript", second_index) > 0:
                        first_index = doc.find("Clip Transcript",
                                               second_index) + 21
                    else:
                        break
                    if doc.find("Clip Keywords", first_index) > 0:
                        second_index = doc.find("Clip Keywords",
                                                first_index) - 16
                    else:
                        break

                    interview_date = doc[first_index - 35:first_index - 25].replace("_", "-")
                    clip_transcript = doc[first_index:second_index]

                    print interview_date, clip_transcript

                    for sublist in list:
                        if sublist[0] == interview_date:
                            sublist[1] = sublist[
                                1] + "\n------------------\n" + clip_transcript
                            contains = True
                    if not contains:
                        list.append([interview_date, clip_transcript])
                    contains = False

                    x = x + 1

        print list
        # print list[0][0], list[0][1]

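        # Match each .mp3 recording to its transcript by date and register it through the API.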
        for i in dirlist:
            if i.endswith('.mp3'):

                tag = i[:10].replace("_", "-")

                index = 0
                for sublist in list:
                    if tag.strip() == sublist[0].strip():
                        description = unicode(list[index][1], errors="ignore")
                        contains = True
                    index = index + 1
                if not contains:
                    description = ""
                contains = False

                data_audio = {
                    "description": description,
                    "duration": MP3(api.path + '/{}/interviews/{}'.format(dev, i)).info.length,
                    "id": "",
                    "interview": api.get('interviews', y + 1),
                    "status": "PRIVATE",
                    "tag": i,
                    "uri": "http://opendata.soccerlab.polymtl.ca/audios/" + i,
                    "author": "",
                    "license": ""
                }
                api.request("Audio", 'audios', data_audio)
                y = y + 1
Example #53
0
def parse_rtf(file):
    doc = Rtf15Reader.read(file)
    return PlaintextWriter.write(doc).getvalue().split('\n')
Example #54
0
from __future__ import absolute_import
from __future__ import print_function
from pyth.plugins.plaintext.writer import PlaintextWriter
import pythonDoc

doc = pythonDoc.buildDoc()

print(PlaintextWriter.write(doc).getvalue())
Example #55
0
import glob
import os
import sys

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter

in_dir, out_dir = sys.argv[1:3]

for in_filename in glob.glob("{}/*.rtf".format(in_dir)):
    # Rtf15Reader expects a binary file object.
    with open(in_filename, "rb") as in_file:
        doc = Rtf15Reader.read(in_file)

    # Derive the output name from the input base name, swapping .rtf for .txt.
    out_filename = os.path.splitext(os.path.basename(in_filename))[0]
    with open("{}/{}.txt".format(out_dir, out_filename), "w") as out_file:
        PlaintextWriter.write(doc, out_file)
Example #56
0
    def document_create_index(document, user_id=None):

        import os
        from xlrd import open_workbook
        from pyth.plugins.rtf15.reader import Rtf15Reader
        from pyth.plugins.plaintext.writer import PlaintextWriter
        import sunburnt

        document = json.loads(document)
        table = s3db.doc_document
        id = document["id"]

        name = document["name"]
        filename = document["filename"]

        filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                        request.application, filename)

        si = sunburnt.SolrInterface(settings.get_base_solr_url())

        extension = os.path.splitext(filename)[1][1:]

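        # Extract plain text with a format-specific tool, falling back to the strings command.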
        if extension == "pdf":
            data = os.popen("pdf2txt.py " + filename).read()
        elif extension == "doc":
            data = os.popen("antiword " + filename).read()
        elif extension == "xls":
            wb = open_workbook(filename)
            data = " "
            for s in wb.sheets():
                for row in range(s.nrows):
                    values = []
                    for col in range(s.ncols):
                        values.append(str(s.cell(row, col).value))
                    data = data + ",".join(values) + "\n"
        elif extension == "rtf":
            doct = Rtf15Reader.read(open(filename))
            data = PlaintextWriter.write(doct).getvalue()
        else:
            data = os.popen("strings " + filename).read()

        # The text needs to be in unicode or ascii, with no control characters
        data = str(unicode(data, errors="ignore"))
        data = "".join(c if ord(c) >= 32 else " " for c in data)

        # Build the Solr document from the extracted data
        # @ToDo: Adjust these fields to match Eden's requirements
        document = {"id": str(id), # doc_document.id
                    "name": data, # the data of the file
                    "url": filename, # the encoded file name stored in uploads/
                    "filename": name, # the filename actually uploaded by the user
                    "filetype": extension  # x.pdf -> pdf is the extension of the file
                    }

        # Add and commit Indices
        si.add(document)
        si.commit()
        # After Indexing, set the value for has_been_indexed to True in the database
        db(table.id == id).update(has_been_indexed = True)

        db.commit()
Example #57
0
    def parse(self):
        with open(self._config['filePath'], "rb") as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
        self.__parsedData = PlaintextWriter.write(doc).getvalue()

        return self.__parsedData
Example #58
0
    def document_create_index(document, user_id=None):

        import os
        from xlrd import open_workbook
        from pyth.plugins.rtf15.reader import Rtf15Reader
        from pyth.plugins.plaintext.writer import PlaintextWriter
        import sunburnt

        document = json.loads(document)
        table = s3db.doc_document
        id = document["id"]

        name = document["name"]
        filename = document["filename"]

        filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                        request.application, filename)

        si = sunburnt.SolrInterface(settings.get_base_solr_url())

        extension = os.path.splitext(filename)[1][1:]

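        # Extract plain text with a format-specific tool, falling back to the strings command.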
        if extension == "pdf":
            data = os.popen("pdf2txt.py " + filename).read()
        elif extension == "doc":
            data = os.popen("antiword " + filename).read()
        elif extension == "xls":
            wb = open_workbook(filename)
            data = " "
            for s in wb.sheets():
                for row in range(s.nrows):
                    values = []
                    for col in range(s.ncols):
                        values.append(str(s.cell(row, col).value))
                    data = data + ",".join(values) + "\n"
        elif extension == "rtf":
            doct = Rtf15Reader.read(open(filename))
            data = PlaintextWriter.write(doct).getvalue()
        else:
            data = os.popen("strings " + filename).read()

        # The text needs to be in unicode or ascii, with no control characters
        data = str(unicode(data, errors="ignore"))
        data = "".join(c if ord(c) >= 32 else " " for c in data)

        # Build the Solr document from the extracted data
        # @ToDo: Adjust these fields to match Eden's requirements
        document = {
            "id": str(id),  # doc_document.id
            "name": data,  # the data of the file
            "url": filename,  # the encoded file name stored in uploads/
            "filename": name,  # the filename actually uploaded by the user
            "filetype": extension  # x.pdf -> pdf is the extension of the file
        }

        # Add and commit Indices
        si.add(document)
        si.commit()
        # After Indexing, set the value for has_been_indexed to True in the database
        db(table.id == id).update(has_been_indexed=True)

        db.commit()
Example #59
0
###########################################################################
# Simple program to convert rtf text file to plain text format
# Input is a file with the .rtf extension
# Output is a plain text file with the same name and the .txt extension
# This program uses the package pyth
# Install by:
#   pip install pyth 
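#
# Example invocation (script and input file names here are hypothetical):
#   python rtf2txt.py notes.rtf    # writes notes.txt alongside the input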
###########################################################################

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter
import sys

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'Convert an rtf file to plain text'
        print 'Usage: %s filename.rtf' % sys.argv[0]
    else:
        # Rtf15Reader expects a binary file object.
        input_file = open(sys.argv[1], 'rb')
        output_file = open(input_file.name.replace('.rtf', '') + '.txt', 'w')
        doc = Rtf15Reader.read(input_file)
        output_file.write(PlaintextWriter.write(doc).getvalue())
        output_file.close()
        input_file.close()