def computeScores(inputDir, outCSV, acceptTypes):
    
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])        

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]
            
                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)
           
                v1 = Vector(file1, ast.literal_eval(file1_parsedData["content"]))
                v2 = Vector(file2, ast.literal_eval(file2_parsedData["content"]))
            

                row_cosine_distance.append(v1.cosTheta(v2))            

                a.writerow(row_cosine_distance)  
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
            except Exception, e:
                pass    
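This example assumes a Vector helper (built from a filename plus the parsed content) with a cosTheta method; neither is shown on this page, and a later example constructs its Vector from metadata alone, so the real class is project-specific. A minimal sketch of what such a helper could look like, treating the second argument as either an iterable of tokens or a token -> count mapping:

import math
from collections import Counter

class Vector(object):
    # Hypothetical stand-in for the project's Vector helper.
    def __init__(self, filename, features):
        self.filename = filename
        # Counter accepts an iterable of tokens or a token -> count mapping.
        self.features = Counter(features)

    def cosTheta(self, other):
        # cosine similarity between the two frequency vectors
        shared = set(self.features) & set(other.features)
        dot = sum(self.features[t] * other.features[t] for t in shared)
        norm_a = math.sqrt(sum(v * v for v in self.features.values()))
        norm_b = math.sqrt(sum(v * v for v in other.features.values()))
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        return dot / (norm_a * norm_b)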
Example #2
def run_exit_tool_on_known_type(dir_list):
    file_list = get_file_list(dir_list)

    for entry in file_list:
        parser.from_file(entry)

    return
def computeScores(inputDir, outCSV, acceptTypes, allKeys):

    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        filename_list = []

        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))

        filename_list = [filename for filename in filename_list if parser.from_file(filename)]
        if acceptTypes:
            filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
        else:
            print "Accepting all MIME Types....."

        files_tuple = itertools.combinations(filename_list, 2)
        for file1, file2 in files_tuple:

            row_edit_distance = [file1, file2]            

            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)
    
            intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys())                
            intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

            file_edit_distance = 0.0
            for feature in intersect_features:

                file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                file2_feature_value = stringify(file2_parsedData["metadata"][feature])

                feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
                    
                file_edit_distance += feature_distance

            
            if allKeys:
                file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                file_edit_distance += len(file1_only_features) + len(file2_only_features)
                file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

            else:
                file_edit_distance /= float(len(intersect_features))    #average edit distance

            row_edit_distance.append(1-file_edit_distance)
            a.writerow(row_edit_distance)
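Both metadata edit-distance versions of computeScores call a stringify helper that is not reproduced on this page. Tika metadata values are either a single string or a list of strings, so a plausible minimal version is:

def stringify(value):
    # Hypothetical helper: flatten a Tika metadata value (a string or a
    # list of strings) into one string that editdistance.eval can compare.
    if isinstance(value, (list, tuple)):
        return " ".join(str(v) for v in value)
    return str(value)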
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
Example #5
def intersect(json_filename, output_name, index_file, start_index=0, end_index=yaoner.MAX_INT_VALUE):
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    if index_file is None:
        index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'

    with open(json_filename) as json_file:
        json_data = json.load(json_file)

        concept_dictionary = dict()

        for key in json_data.keys():
            concept_dictionary[key.lower()] = {}

        file_list = yaoner.read_index_file(index_file, base_directory, start_index, end_index)

        for idx, val in enumerate(file_list):
            print(start_index + idx)
            parsed = parser.from_file(''.join([base_directory, val]))
            if 'content' in parsed and parsed['content'] is not None:
                content = parsed['content']
                words = content.split()
                for word in words:
                    lowercased = word.lower()
                    if lowercased in concept_dictionary:
                        last_part = os.path.basename(val)
                        concept_dictionary[lowercased][last_part] = 1
        dump(concept_dictionary, output_name + 'from' + str(start_index) + 'to' + str(end_index) + '.json')

    return
Example #6
def extract(path):
  parsed = parser.from_file(path)
  content = parsed["content"]

  ners = StanfordExtractor(content).extract()
  entities = CustomEntityExtractor(content).extract()
  quantities = QuantityExtractor(content).getQuantities()

  if len(ners['LOCATION']) > 0:
    l = GeoTopic(map(lambda l: l['name'], ners['LOCATION']))
    geo = l.getInfo()
    locations = l.getLocations()
  else:
    geo = [ ]
    locations = [ ]

  return {
    'geo' : geo,
    'locations' : locations,
    'entities': entities['entities'],
    'places': ners['LOCATION'],
    'dates': ners['DATE'],
    'quantities': quantities,
    'metadata': parsed['metadata'],
    'mime-type': parsed['metadata']['Content-Type'],
    'id': idf.set(path)
  }
def filterFiles(inputDir, acceptTypes):
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    filename_list = [filename for filename in filename_list if parser.from_file(filename)]
    if acceptTypes:
        filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
    else:
        print "Accepting all MIME Types....."

    return filename_list
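filterFiles (and the same filtering logic inlined into the computeScores variants on this page) calls parser.from_file twice per candidate file: once to check that parsing works and once to read the Content-Type, which doubles the round trips to the Tika server. A sketch of an equivalent filter that parses each file only once; the list-valued Content-Type handling is a defensive assumption:

import os
from tika import parser

def filterFilesOnce(inputDir, acceptTypes):
    # Walk the tree like filterFiles, skipping dot-directories and dot-files.
    filename_list = []
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    accepted = []
    for filename in filename_list:
        parsed = parser.from_file(filename)          # parse once, reuse below
        if not parsed:
            continue
        if not acceptTypes:
            accepted.append(filename)
            continue
        content_type = parsed.get('metadata', {}).get('Content-Type', '')
        if isinstance(content_type, list):           # Tika may return a list
            content_type = content_type[0]
        if content_type.split('/')[-1] in acceptTypes:
            accepted.append(filename)
    return accepted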
def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            i = 0
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))

                if pages is not None and i >= pages:  # break after x pages
                    break

            log.debug("Processed %i pages (%s max)", i, pages)
            return '\n'.join(text_array)
        except:
            # reraise everything
            raise
    else:
        try:
            content = parser.from_file(fname)['content']
            return (content or '').encode('UTF-8')
        except:
            # reraise everything
            raise
Example #9
def compareValueSimilarity(fileDir, encoding='utf-8'):
    union_feature_names = set()
    file_parsed_data = {}
    resemblance_scores = {}
    file_metadata={}

    for filename in fileDir:
        file_parsed = []
        parsedData = parser.from_file(filename)
        file_metadata[filename] = parsedData["metadata"]

        for key in parsedData["metadata"].keys() :
            value = parsedData["metadata"].get(key)[0]
            if isinstance(value, list):
                value = ""
                for meta_value in parsedData["metadata"].get(key)[0]:
                    value += meta_value
            file_parsed.append(str(key.strip(' ').encode(encoding) + ": " + value.strip(' ').encode(encoding)))


        file_parsed_data[filename] = set(file_parsed)
        union_feature_names = union_feature_names | set(file_parsed_data[filename])

    total_num_features = len(union_feature_names)
    
    for filename in file_parsed_data.keys():
        overlap = {}
        overlap = file_parsed_data[filename] & set(union_feature_names) 
        resemblance_scores[filename] = float(len(overlap))/total_num_features

    sorted_resemblance_scores = sorted(resemblance_scores.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_resemblance_scores, file_metadata
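Despite the parameter name, compareValueSimilarity iterates over fileDir directly, so it expects a list of file paths rather than a directory. A usage sketch with placeholder file names:

# Placeholder paths; any files Tika can parse will do.
scores, metadata = compareValueSimilarity(['a.pdf', 'b.pdf', 'c.pdf'])
for filename, score in scores:
    print(filename, score)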
Example #10
def run_ner(start_index=0, end_index=MAX_INT_VALUE):
    index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'

    file_list = read_index_file(index_file, base_directory, start_index, end_index)

    measurement_list = []
    index = 0 + start_index
    for entry in file_list:
        print(index)
        parsed = parser.from_file(''.join([base_directory, entry]))
        if 'metadata' in parsed:
            if 'X-TIKA:EXCEPTION:embedded_exception' in parsed['metadata']:
                index += 1
                continue
        if 'content' in parsed:
            if parsed['content'] is not None:
                # print(json.dumps(parsed['metadata'], indent=4))
                # print(parsed['content'])
                # print('content size ', len(parsed['content']))
                if len(parsed['content']) > 1 * 1024 * 1024:
                    index += 1
                    continue
                measurements = extract_measurement(parsed['content'])
                if measurements is not None and len(measurements) > 0:
                    measurement_list.append({entry.split('/')[-1]: measurements})
        index += 1
    dump_to_json(measurement_list, '/Users/Frank/working-directory/ner-measurement-mentions/',
                 'from' + str(start_index) + 'to' + str(end_index))
    return
Example #11
def getKeywords(pdfFile,Occur):

   tikaurl= tika_obo.getTikaAddress()
   parsed = parser.from_file(pdfFile, tikaurl)

   metadata = parsed["metadata"]
   doccontent = parsed["content"]

   fullwordlist = obo.stripNonAlphaNum(doccontent)
   wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
   dictionary = obo.wordListToFreqDict(wordlist)
   sorteddict = obo.sortFreqDict(dictionary)
   count = 0
   keywords = [] 
   shortkey = []
   maxoccur = Occur
   for s in sorteddict: 
       numocc = int(s[0])
       word = s[1].encode('utf-8')
       if numocc > maxoccur:
          keyword = { word : str(numocc) }
          keywords.append(keyword)
          if len(word)>6:
             shortkey.append(word.lower())
       count = count + 1
   if Occur > 0:
       return shortkey
   return keywords
def load_topics(filename):
	languages.append(language.from_file(filename))
	parser_obj = parser.from_file(filename)
	if 'content' in parser_obj and parser_obj['content']:
		words.extend(get_nouns(parser_obj['content']))
	if 'metadata' in parser_obj:
		metadata_dict = parser_obj['metadata']
		if 'Author' in metadata_dict:
			if type(metadata_dict['Author']) == type([]):
				metadata.append(metadata_dict['Author'][0])
			else:	
				metadata.append(metadata_dict['Author'])

		if 'xmp:CreatorTool' in metadata_dict:
			if type(metadata_dict['xmp:CreatorTool']) == type([]):
				metadata.extend(metadata_dict['xmp:CreatorTool'])
			else:	
				metadata.append(metadata_dict['xmp:CreatorTool'])

		if 'Content-Type' in metadata_dict:
			if type(metadata_dict['Content-Type']) == type([]):
				metadata.append(metadata_dict['Content-Type'][0])
			else:
				metadata.append(metadata_dict['Content-Type'])
		if 'Company' in metadata_dict:
			if type(metadata_dict['Company']) == type([]):
				metadata.append(metadata_dict['Company'][0])
			else:
				metadata.append(metadata_dict['Company'])
Example #13
def search_content(file_path, expressions):
    """Open a file and search its contents against a set of RegEx."""
    matches = []
    count = 0
    data = parser.from_file(file_path)
    # Read into an I/O buffer for better readline support
    if not data:
        # There is no content that could be extracted
        return matches

    content = io.StringIO(data['content'])
    # TODO this may create a very large buffer for larger files
    # We may need to convert this to a while readline() loop
    for line in content.readlines():
        count += 1  # count the number of lines
        if line:
            for rex in expressions:
                # Check if the line matches all the expressions
                res = rex.regex.search(line)
                if res:
                    # If there's a match append to the list
                    matches.append(cazobjects.CazRegMatch(res,
                                                          file_path,
                                                          count,
                                                          rex.name))
    return matches
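search_content expects every entry of expressions to expose a compiled pattern as .regex and a label as .name; the CazRegMatch wrapper it builds comes from the surrounding project. A hedged driver with a namedtuple standing in for that expression object:

import re
from collections import namedtuple

# Hypothetical expression container matching the .name/.regex access above.
Expression = namedtuple('Expression', ['name', 'regex'])

expressions = [
    Expression('email', re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')),
    Expression('ipv4', re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')),
]

for match in search_content('report.pdf', expressions):
    print(match)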
Example #14
def getTikaTags(filename):
    import tika
    from tika import parser
    import obo
    import tika_obo
    import gethavens

    tikaUrl = getTikaAddress()
    parsed = parser.from_file(filename, tikaUrl)
    metadata = parsed["metadata"]
    content = parsed["content"]
    jsonprops = {'cm:title': str(metadata['resourceName'])}

    for key in metadata:
        newkey = str(key)
        value = str(metadata[key])
        jsonprops[newkey] = value

    title = jsonprops['resourceName']
    namebreak = title.split('.')
    havenrecord = gethavens.getPropertiesHaven(str(jsonprops['resourceName']))
    jsonprops['Description'] = 'Ranked:' + str(havenrecord['rank']) \
       + ' most secretive Tax Haven\nhttps://www.google.co.uk/maps/place/' \
       + havenrecord['country']
    jsonprops['Name'] = havenrecord['country']
    jsonprops['cmis:title'] = str(title)
    jsonprops['cmis:author'] = 'admin'
    return jsonprops
 def _request_pdf_data(self, url):
     parsed = parser.from_file(url)
     return {
         'url': url,
         'title': self._parse_pdf_title(parsed),
         'body': self._parse_pdf_body(parsed)
         }
Example #16
def main(file_name):
	fi = open("sentences.txt", "w+")
	fi_summary = open("summary.txt", "w+")
	fi_cool = open("wtv.txt", "w+")
	score_sentences = SentenceScores()

	parsed = parser.from_file(file_name)
	print parsed["metadata"]
	content = parsed["content"]
	content = content.strip()
	fi_cool.write(content.encode("utf-8"))
	sentences = content.split(". ")
	sentences = map(clean_sentence, sentences)
	
	lines = score_sentences.get_summary_lines(sentences)
	max_len = len(lines) / 3
	needed_lines = lines[0:max_len]
	sorted_lines = sorted(needed_lines, key=lambda x: x[0])

	for line_num, score in sorted_lines:
		fi_summary.write((str(line_num+1)+", "+sentences[line_num]).encode("utf-8"))

	for sentence in sentences:
		fi.write(sentence.encode("utf-8"))

	fi.close()
	fi_summary.close()
Example #17
	def __init__(self, fileName):
		parsed = parser.from_file(fileName)
		metadata = parsed["metadata"]
		#   Return re.sub('[\s+]', '', content)
		#  TODO: Delete... Very Redundant..
		content = parsed["content"]
		content = content.replace('\n', '')
		content = content.replace('\t', '')
		content = content.replace('\'', '')
		content = content.replace('\"', '')
		rx = re.compile('\W+')
		content = rx.sub(' ', content).strip()
		self.content = content
		#   Title...
		try:
			title = metadata['title']
		except:
			title = 'Untitled'
		title = title.replace('\t', '')
		title = title.replace('\t', '')
		title = title.replace('\'', '')
		title = title.replace('\"', '')
		title = rx.sub(' ', title).strip()
		self.title = title
		#  self.type = self.metadata['Content-Type-Hint']
		#  self.name = self.metadata['resourceName']
		#  lanFix = re.sub('[\s+]', '', content)
		self.lang = language.from_file(fileName)
    def makeSearchable(self, src, subdir):
        rootDir = subdir + "/examplePDFs"
        pdfPath = rootDir + "/" + "rawPdfs"
        finishedTextPath = rootDir + "/" + "finishedText"
        removed_text_path = rootDir + "/" + "removedText"
        gsPath = rootDir + "/" + "gsPdfs"
        imagesProcessedPath = rootDir + "/" + "imagesProcessed"
        imageText = rootDir + "/" + "imageText"

        if not os.path.exists(pdfPath):
            os.makedirs(pdfPath)
        if not os.path.exists(finishedTextPath):
            os.makedirs(finishedTextPath)
        if not os.path.exists(removed_text_path):
            os.makedirs(removed_text_path)
        if not os.path.exists(gsPath):
            os.makedirs(gsPath)
        if not os.path.exists(imagesProcessedPath):
            os.makedirs(imagesProcessedPath)
        if not os.path.exists(imageText):
            os.makedirs(imageText)

        filename, fileType = src.rsplit(".", 1)
        print("\n**********************")
        print("Processing file: " + filename)
        print("**********************\n")

        # Extract easy text
        print("Getting text that can be easily extracted...")
        rawText = parser.from_file(pdfPath + "/" + src)
        if rawText["content"] is None:
            print("Found no text to extract, continuing process")
        else:
            fileOutput = open(finishedTextPath + "/" + filename + ".txt", 'w')
            fileOutput.write(rawText["content"].encode("utf-8"))
            fileOutput.close()

        # Remove text from pdf
        print("Removing text from pdf")
        process1 = subprocess.Popen(['java', '-jar', 'PdfTextDeleter.jar', src, os.path.join(removed_text_path, src)])
        process1.wait()

        # Apply ghostscript to removed text pdfs
        if not os.path.exists(gsPath + "/" + filename + "-imgs"):
            os.makedirs(gsPath + "/" + filename + "-imgs")
        if not os.path.exists(rootDir + "/imagesProcessed/" + filename + "-imgs"):
            os.makedirs(rootDir + "/imagesProcessed/" + filename + "-imgs")
        if not os.path.exists(rootDir + "/imageText/" + filename + "-imgs"):
            os.makedirs(rootDir + "/imageText/" + filename + "-imgs")
        print("Converting left over pdf to images")
        process2 = subprocess.Popen(["gs", "-dNOPAUSE", "-sFONTPATH=/opt/local/share/ghostscript/9.16/Resource/Font/",
                   "-sDEVICE=pngalpha", "-r300", "-dBATCH", "-sOutputFile=" + gsPath + "/" + filename + "-imgs" + "/" + filename + "-%03d" ".png",
                   removed_text_path + "/" + src], env={'PATH': '/opt/local/bin/'})
        process2.wait()
        self.preprocessImages(rootDir, subdir, src)
        self.applyOCRToImages(rootDir, subdir, src)
        self.mergeTextFiles(rootDir, subdir, src)
Example #19
 def parse_file(self, path):
     """
     Parses a file at given path
     :param path: path to file
     :return: parsed content
     """
     parsed = tkparser.from_file(path)
     parsed['file'] = os.path.abspath(path)
     return parsed
def computeScores(inputDir, outCSV, acceptTypes):
    
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])        

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:

            row_cosine_distance = [file1, file2]
            
            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)

            v1 = Vector(file1_parsedData["metadata"])
            v2 = Vector(file2_parsedData["metadata"])

            row_cosine_distance.append(v1.cosTheta(v2))            

            a.writerow(row_cosine_distance)  
Example #21
def test_tika_solr():
    s = create_connection(u"Test")

    file_path = u"testdata/example.pdf"

    parsed = parser.from_file(file_path)

    log_parsed(parsed)

    s.add([parsed], commit=True)

    return 1, 0
Example #22
def convert( filepath, output ):
    parsed = parser.from_file( filepath )

    if output:
        basename, ext_pdf = os.path.splitext( os.path.basename( filepath ) )
        output_path = os.path.join( output, basename + '.json' )
    else:
        extensionless_filepath, ext_pdf = os.path.splitext( filepath )
        output_path = extensionless_filepath + '.json'

    with open( output_path, 'wt' ) as textfile:
        json.dump( parsed, textfile, ensure_ascii=True )
def get_measurements(filename):
	parser_obj = parser.from_file(filename)
	if 'content' in parser_obj and parser_obj['content']:
		return [x for x in regextagger.tag(tokenizer.tokenize(parser_obj['content'])) if x[1] != 'OTHER']

	f_read = open(sys.argv[1], 'r')
	given_text = f_read.read()
	segmented_lines = nltk.sent_tokenize(given_text)
	for text in segmented_lines:
		words = word_tokenize(text)
		sent = t_gram_tag.tag(words)
		print sent
Example #24
 def extractText(self, params):
     '''
     Using Tika to extract text from given file
     and return the text content.
     '''
     file_name = params['file_name']
     parsed = parser.from_file(file_name)
     status = IndexUploadedFilesText(file_name, parsed["content"])
     if status[0]:
         return {'job':'text_extraction', 'status': 'successful', 'comment':'Text extracted and indexed to Solr.'}
     else:
         return {'job':'text_extraction', 'status': 'unsuccessful', 'comment':status[1]}
Example #25
def main():

    # read the folder name from argument
    arg_parser = argparse.ArgumentParser(description='Detecting near duplicates using SimHashes')
    arg_parser.add_argument('-f', '--folder', help='Folder with all the images', required=True)
    arg_parser.add_argument('-t', '--tika', help='Path to a running tika server', required=True)
    arg_parser.add_argument('-cb', '--contentbased', help='true/false. Use content in deduplication calculation. Default = false (Must have Tika OCR enabled/ Have Tesseract Installed)', required=False)
    args = arg_parser.parse_args()

    SIM_HASH = defaultdict()
    # read all files
    for root, dirs, files in os.walk(args.folder):
        # grab metadata from each file and write it to an output file
        files = sorted(files)
        for f in files:
            path = root + f
            parsed_data = parser.from_file(path, args.tika)
            if args.contentbased:
                if args.contentbased.lower() == 'true':
                    SIM_HASH[f] = get_simhash(parsed_data, True)
            else:
                SIM_HASH[f] = get_simhash(parsed_data, False)

    # make clusters
    SORTED_HASH = sorted(SIM_HASH.items(), key=operator.itemgetter(1))
    DISTANCES = OrderedDict()
    DISTANCES[SORTED_HASH[0][0]] = None

    for i in range(1, len(SORTED_HASH)):
        DISTANCES[SORTED_HASH[i][0]] = simhash.get_hamming_distance(SORTED_HASH[i - 1][1], SORTED_HASH[i][1])

    # cluster images together
    cluster_number = 0
    CLUSTERS = defaultdict(list)
    for key, value in DISTANCES.iteritems():
        print key + ": " + str(value)
        if value is None:
            CLUSTERS[cluster_number].append(key)
        else:
            if value <= THRESHOLD:
                CLUSTERS[cluster_number].append(key)
            else:
                cluster_number += 1
                CLUSTERS[cluster_number].append(key)

    print '*' * 10 + 'CLUSTERS' + '*' * 10
    for key, value in CLUSTERS.iteritems():
        print 'CLUSTER ' + str(key) + ':'
        for x in value:
            print '\t' + x

    return
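The de-duplication example relies on a get_simhash helper, a simhash module providing get_hamming_distance, and a THRESHOLD constant, none of which come from tika-python. A rough sketch of those pieces, building a 64-bit SimHash over the parsed metadata (and optionally content) tokens; all names and the threshold value here are assumptions, and the distance helper is written as a flat function rather than a module attribute:

import hashlib

def _hash64(token):
    # stable 64-bit hash of a single token
    return int(hashlib.md5(token.encode('utf-8')).hexdigest()[:16], 16)

def get_simhash(parsed_data, use_content=False):
    # SimHash: each fingerprint bit is weighted by the tokens that set or clear it.
    text = str(parsed_data.get('metadata', ''))
    if use_content and parsed_data.get('content'):
        text += ' ' + parsed_data['content']
    weights = [0] * 64
    for token in text.split():
        h = _hash64(token)
        for bit in range(64):
            weights[bit] += 1 if (h >> bit) & 1 else -1
    fingerprint = 0
    for bit in range(64):
        if weights[bit] > 0:
            fingerprint |= 1 << bit
    return fingerprint

def get_hamming_distance(hash1, hash2):
    # number of bits on which the two fingerprints differ
    return bin(hash1 ^ hash2).count('1')

THRESHOLD = 3   # illustrative cut-off; tune per corpus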
def parse_files(file_name):
    print("parsing file: %s \n" % file_name)

    parsed = parser.from_file(file_name)


    print("meta-data:\n")
    print(parsed["metadata"])
    print("content:\n")
    content = parsed["content"]
    c2 = content.encode('utf-8').strip()
    print(c2)
    print("\n\n");
Example #27
def extract_text(request, file_name):
    '''
        Using Tika to extract text from given file
        and return the text content.
    '''
    if "none" in IndexStatus("text", file_name):
        parsed = parser.from_file("{0}/{1}/{2}".format(APP_NAME, UPLOADED_FILES_PATH, file_name))
        status = IndexUploadedFilesText(file_name, parsed["content"])
        if status[0]:
            return HttpResponse(status=200, content="Text extracted.")
        else:
            return HttpResponse(status=400, content="Cannot extract text.")
    else:
        return HttpResponse(status=200, content="Loading...")
Example #28
def scan(filelist, conf=DEFAULTCONF):
    results = []

    for f in filelist:
        metadata = parser.from_file(f).get('metadata', {})
        for field in conf['remove-entry']:
            if field in metadata:
                del metadata[field]
        results.append((f, metadata))

    metadata = {}
    metadata["Name"] = NAME
    metadata["Type"] = TYPE
    return results, metadata
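scan reads only one key from its configuration, 'remove-entry', the list of metadata fields to strip from each result, and it also expects module-level NAME and TYPE constants. An illustrative configuration; the field names and values below are examples, not taken from the original module:

NAME = "Tika metadata"
TYPE = "Metadata"
DEFAULTCONF = {
    # metadata fields to drop from every scanned file's result
    'remove-entry': ['X-TIKA:content', 'X-Parsed-By'],
}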
    def extractParsingInfo(self):
        FileSizeList = []

        # Getting the files whose size would be computed
        response = MIME_Core().facetQuery('metadata')
        mimeTypeResponse = response.result.dict['facet_counts']['facet_fields']['metadata']


        mimeList = []
        for mime_type, count in mimeTypeResponse.iteritems():
            if mime_type == 'application/java-archive':
                continue
            mimeList.append(mime_type)


        mime_size_diversity = {}
        for mime in mimeList:
            metadata_list = {}
            print mime[mime.index('/')+1:]

            query = 'metadata:%s' % (mime)
            response = MIME_Core().queryAll(query=query, rows = 100)
            files = response.result.dict['response']['docs']

            for file in files:
                parsed = parser.from_file(file['file'][0])
                if 'metadata' in parsed:
                    metadata = parsed['metadata']

                    for key,value in metadata.iteritems():
                        if key in mime_size_diversity:
                            mime_size_diversity[key] += 1
                        else:
                            mime_size_diversity[key] = 1
                    pass
            print 'done with ' + mime

        top_metadata = sorted(mime_size_diversity.items(), key=operator.itemgetter(1), reverse=True)

        metadata = []
        for item in top_metadata[:20]:
            metadata.append(item[0])
            metadata.append(item[1])
            pass

        out_file = open('data/word_cloud/word_cloud.json',"w")
        json.dump(metadata,out_file, indent=4)


        pass
	def spell_checker(self, page, stopwordsList):
		driver = self.driver
		driver.implicitly_wait(2)
		driver.get(page)
		self.f.write('--- checking for spelling %s\n' %page)
		allTextOnPage = parser.from_file(page)['content'].encode('utf-8')
		allTextOnPage = re.findall('[a-z]+', allTextOnPage.lower()) 
		stopwordsList.extend(stopwords.words('english'))
		allTextOnPage = [w for w in allTextOnPage if not w in stopwordsList]

		for word in allTextOnPage:
			if not wordnet.synsets(word):
				print 'Is this correct? ', word
				self.f.write('Is this word correct? %s\n' %word)
Example #31
        print(check[1])
        msg_check = True
        print('msg_check is True')
    except:
        msg_check = False
        print('msg_check is False')
    return msg_check


#looping through the source folder and opening pdf files
for filename in os.listdir('./source'):
    if filename.endswith(".pdf"):
        print(os.path.join(filename))
        new_filename = (os.path.join(filename))
        #opening the target PDF
        reader = parser.from_file('./source/' + filename)
        pdfText = reader['content']
        cal_item = cal_check(pdfText)
        if cal_item is False:
            #getting the date
            date_target = pdfText.split('Sent: ')
            date_target = date_target[1].split('\n')
            date_target_final = date_target[0]
            date_entry = parse(date_target_final, fuzzy_with_tokens=True)
            #getting the recipients
            to_target = pdfText.split('To: ')
            to_target = to_target[1].split('\n')
            to_entry = to_target[0]
            try:
                cc_target = pdfText.split('Cc: ')
                cc_target = cc_target[1].split('\n')
Example #32
from tika import parser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser

path = r"C:\Thameem\Testing\xxx\security-guide.pdf"
parsed_txt = parser.from_file(path)
text = parsed_txt["content"]
print(text)

if path.split(".")[1] == "pdf":
    fp = open(path, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    ToC_list = [i[1] for i in doc.get_outlines()]
    print(ToC_list)

# lst =[i for i in lst if re.search("[A-Za-z]{3,}",i) and re.sub("^[0-9]+","",i).strip() not in Filtered_ToC_list]
#     print(lst)
#     processed_txt= " ".join([re.sub("'|\"|\\\\","",x) for x in lst]
# text = " ".join([s for s in text.split("\n") if s and])
# print(text)
Example #33
    new_file = list(set(list2).difference(set(list1)))   # if updated, new_file is the list of newly added files
    list1 = copy.deepcopy(list2)                         # refresh list1

    for file in new_file:                                # handle each newly added file in turn
        print("\nNew file:", file)
        extension_name = os.path.splitext(file)[1]       # judge the file by its extension
        if extension_name != '.pdf':
            print("Not a PDF file!")
            continue
        file_name = os.path.splitext(file)[0]            # get the file name
        print("Processing:", file_name, "please wait...")

        pdf_file = spy_folder + '/' + file     # pdf_file is the full path of the source file
        word_file = config['save_folder'] + '/' + file_name + '.docx'     # word_file is the output .docx path
        text_file = config['save_folder'] + '/' + file_name + '.txt'      # text_file is the output .txt path
        parse_entire_pdf = parser.from_file(pdf_file, xmlContent=True)
        parse_entire_pdf = parse_entire_pdf['content']
        content = html_to_plain_text(parse_entire_pdf)

        pattern = re.compile(".*" + key_word + ".*", re.I)
        result = pattern.findall(content)
        print("Total matching paragraphs:", len(result))

        result_csv = list()
        result_csv.append("Title: " + file_name)
        # result_csv.append("Author: " + file_name + "\n")
        result_text = "Title: " + file_name
        # result_text += "\nAuthor: " + file_name + "\n"

        cnt = 1
        for para in result:
Example #34
{q_num}
\numeric off
:
\end_layout'''
    get_path = r'C:\Users\roybo\Desktop\University\semester 2'
    create_folder_path = r'C:\Users\roybo\OneDrive\University'
    dups_in_create = True
    subject = None
    hw_number = None
    number_quest = None
    get_lines = 5
    assignment_path = None
    if subject is None or number_quest is None:
        assignment_path = assignment_path if assignment_path is not None else sys.argv[
            1]
        assignment_text = parser.from_file(assignment_path)['content'].strip()
    # try:
    #     assignment_text = textract.process(assignment_path).decode('UTF-8')
    # except UnicodeDecodeError as e:
    #     print(parser.from_file(assignment_path))

    if subject is None:
        calculated_subject = get_subject(assignment_text)
        if calculated_subject is None:
            raise Exception("can't detect subject")
        else:
            subject = calculated_subject
    if number_quest is None:
        calculated_questions = count_questions(assignment_text, subject)
        if calculated_questions is None:
            raise Exception("can't detect question number")
Example #35
        res = pattern.findall(text)
        res = set([''.join(i) for i in res])
        for item in res:
            if item.startswith('Fig'):
                figures[page] += 1
        if figures[page] == 0:
            s = set()
            pattern = re.compile(r'(Figure|Table|Fig\.)\s*(\d+)(\.[A-Z]{0,1})')
            res = pattern.findall(text)
            res = set(
                [''.join(i).replace('.', ':').split(':')[0] for i in res]) - s
            s = s | res
            for item in res:
                if item.startswith('Fig'):
                    figures[page] += 1

    return figures, tables, equations


if __name__ == "__main__":
    pdfFileObject = open(pdf, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    # page-wise feature
    figures, tables, formulas = parsevec(pdf, pdfReader, count)
    raw = parser.from_file(pdf)
    text = str(raw['content']).split('\n')
    title = word_tokenize(text[0].rstrip('\n'))
    # paper-wise feature
    wordf = getWordsf(title, text[1:])
    references = countReferences(text)
Example #36
 def get_transactions(self, trans_file):
     parsed = parser.from_file(trans_file)
     #print(parsed["content"])
     prev = None
     fund = None
     last_dt_trans = list()
     transactions = dict()
     for l in parsed['content'].splitlines():
         if l.strip() == '':
             continue
         if 'TransactionDate' in l and prev:
             fund = prev.replace('TRANSACTION SUMMARY FOR ', '')
         elif 'Opening Balance' in l:
             continue
         elif 'Closing Balance' in l:
             ldt = get_date_or_none_from_string(l.split(' ')[0], '%d-%b-%Y')
             for t in last_dt_trans:
                 t['date'] = ldt
                 transactions[fund].append(t)
             last_dt_trans.clear()
             fund = None
         elif fund:
             tran = dict()
             description = ''
             field = 0
             for i, token in enumerate(l.split(' ')):
                 if i == 0:
                     dt = get_date_or_none_from_string(token, '%d-%b-%Y')
                     if dt:
                         tran['date'] = dt
                         field = 1
                     else:
                         if description == '':
                             description = token
                         else:
                             description += ' ' + token
                         field = 1
                 else:
                     temp = get_float_or_none_from_string(token)
                     if not temp and temp != 0:
                         if description == '':
                             description = token
                         else:
                             description += ' ' + token
                     else:
                         if field == 1:
                             tran['units'] = temp
                             tran['description'] = description
                         elif field == 2:
                             tran['nav'] = temp
                         elif field == 3:
                             tran['trans_amount'] = temp
                         field += 1
             if not fund in transactions:
                 transactions[fund] = list()
             if 'date' in tran:
                 transactions[fund].append(tran)
             else:
                 last_dt_trans.append(tran)
         else:
             print(f'ignore {l}')
         prev = l
     return transactions
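get_transactions leans on two small parsing helpers defined elsewhere in that project. Plausible minimal versions, with signatures inferred from the calls above:

from datetime import datetime

def get_date_or_none_from_string(value, fmt='%d-%b-%Y'):
    # Hypothetical helper: parse a date string, or return None on failure.
    try:
        return datetime.strptime(value, fmt).date()
    except (ValueError, TypeError):
        return None

def get_float_or_none_from_string(value):
    # Hypothetical helper: parse a number (tolerating thousands separators),
    # or return None when the token is not numeric.
    try:
        return float(value.replace(',', ''))
    except (ValueError, AttributeError):
        return None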
def computeScores(inputDir, outCSV, acceptTypes, allKeys):

    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        filename_list = []

        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))
        try:
            filename_list = [filename for filename in filename_list if "metadata" in parser.from_file(filename)]
        except ConnectionError:
            sleep(1)

        

        if acceptTypes:
            filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8').decode('utf-8')).split('/')[-1] in acceptTypes]
        else:
            print("Accepting all MIME Types.....")

        files_tuple = itertools.combinations(filename_list, 2)
        for file1, file2 in files_tuple:
            try:           
                row_edit_distance = [file1, file2]            

                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)
                
                intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys()) 
                            
                intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

                file_edit_distance = 0.0
                for feature in intersect_features:

                    file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                    file2_feature_value = stringify(file2_parsedData["metadata"][feature])

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
                    
                    file_edit_distance += feature_distance

            
                if allKeys:
                    file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    file_edit_distance += len(file1_only_features) + len(file2_only_features)       # increment by 1 for each disjunct feature in (A-B) & (B-A), file1_disjunct_feature_value/file1_disjunct_feature_value = 1
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

                else:
                    file_edit_distance /= float(len(intersect_features))    #average edit distance

                row_edit_distance.append(1-file_edit_distance)
                a.writerow(row_edit_distance)

            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
Example #38
import q
import re
import traceback

if False:

    " JVM and tika "

    import tika
    tika.initVM()

    from tika import parser

    raw = parser.from_file('./data/20190417181644xqqs.pdf', xmlContent=True)
    print(raw["metadata"])
    print(raw['content'])
    q.d()
    open("aaaa.html", "w").write(raw['content'])

if True:
    """
        https://github.com/pikepdf/pikepdf/
        operates on whole PDF pages
        Chinese is not supported, and there is no encoding setting to be found
        the documentation is JUST LIKE SHIT
    """

    import pikepdf

    # Elegant, Pythonic API
    # with pikepdf.open('./data/Paperid975.pdf') as pdf:
Example #39
def analysis():
    directory = os.fsencode(directory_path)
    bow_df = pd.DataFrame(columns=headings_bow)  # Create empty table for bow
    report = docx.Document()  # Create report document
    report.add_heading(f'Analysis {os.path.basename(directory_path)}',
                       0)  # Add title to report
    for file in os.listdir(directory):
        document_path = os.path.join(directory, file).decode()
        document = parser.from_file(document_path)  # Retrieve text from file
        document = document['content']
        content = sub(r'http\S+', " ", document)  # Delete all links
        content = sub(
            "[^a-zA-Z0-9|^-]", " ",
            content).lower()  # Delete all punctuation/upper case letters
        content_words = tokenize.word_tokenize(
            content)  # Split words into list
        language_name = language[classify(content)[0]]  # Detect text language
        content_words_core = " ".join(
            filter(lambda x: x in keywords, content_words)).split(
            )  # Delete all words except for words in keywords
        vector = vectorize(content_words_core)  # Count occurrence of keywords
        filename_first = os.fsdecode(file)[
            0:3]  # Select first 3 characters of filename
        vector.insert(
            0, language_name.capitalize())  # Add language to vector-list
        vector.insert(0, filename_first
                      )  # Add first 3 characters of filename to vector-list
        bow = pd.DataFrame(
            vector).transpose()  # Put vector-list into table and transpose
        bow.columns = headings_bow  # Add headings to table
        bow_df = pd.concat([bow_df, bow])  # Add table to table of all files
        bow_df[keywords] = bow_df[keywords].astype(
            'int64')  # Change datatype in table to integer
    bow_df.loc[:, 'Total'] = bow_df.sum(numeric_only=True,
                                        axis=1)  # Add totals column
    bow_df.sort_values(
        by=['Total'], inplace=True,
        ascending=False)  # Sort table on descending total column
    table_bow = report.add_table(bow_df.shape[0] + 1,
                                 bow_df.shape[1])  # Add template table
    for j in range(bow_df.shape[-1]):
        table_bow.cell(0, j).text = bow_df.columns[j]  # Add headers to table
    for i in range(bow_df.shape[0]):
        for j in range(bow_df.shape[-1]):
            table_bow.cell(i + 1,
                           j).text = str(bow_df.values[i,
                                                       j])  # Add data to table
    table_bow.style = 'Light Shading'  # Change style of table

    for file in os.listdir(directory):
        document_path = os.path.join(directory, file).decode()
        document = parser.from_file(document_path)  # Retrieve text from file
        document = document['content']
        content = sub(r'http\S+', " ", document)  # Delete all links
        content = sub("[^a-zA-Z|^-]", " ", content).lower(
        )  # Delete all punctuation/upper case letters/numbers
        content_words = [
            w for w in content.split() if len(w) > 1
        ]  # Delete all words with one letter and split words into list
        language_name = language[classify(content)[0]]  # Detect text language
        content_words_core = [
            w for w in content_words if w not in stopwords.words(language_name)
        ]  # Delete adverbs
        stemmed_words = [
            SnowballStemmer(language_name).stem(word)
            for word in content_words_core
        ]  # Group different forms of a word to a single item
        fdist1 = FreqDist(stemmed_words)  # Count occurrence of words
        top_10_words = pd.DataFrame(fdist1.most_common(10),
                                    columns=['Word', 'Count'
                                             ])  # Put top 10 words in table
        filename = os.fsdecode(file)  # Retrieve filename
        report.add_heading(filename, level=1)  # Add subtitle per document
        report.add_paragraph(
            f'Language: {language_name.capitalize()}')  # Add language
        table = report.add_table(top_10_words.shape[0] + 1,
                                 top_10_words.shape[1])  # Add template table
        for j in range(top_10_words.shape[-1]):
            table.cell(
                0, j).text = top_10_words.columns[j]  # Add headers to table
        for i in range(top_10_words.shape[0]):
            for j in range(top_10_words.shape[-1]):
                table.cell(i + 1, j).text = str(
                    top_10_words.values[i, j])  # Add data to table
        table.style = 'Light Shading'  # Change style of table

    report.save(f'{os.environ["USERPROFILE"]}/Desktop/report.docx'
                )  # Save document to desktop
Example #40
import tika
from tika import parser
from os import listdir
from os.path import join, isfile

# before starting the script, Tika has to be started
# java -jar tika-app-1.18.jar -s
tika.initVM()
in_path = 'data/raw'
out_path = 'data/interim'
raw_texts = [f for f in listdir(in_path) if isfile(join(in_path, f))]
for raw_text in raw_texts:
    parsed = parser.from_file(join(in_path, raw_text))
    plain_text = parsed['content'].strip()
    fname = raw_text.split('.')[0] + '.txt'
    with open(join(out_path, fname), 'w') as f:
        f.write(plain_text)
Example #41
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser
import re
import json

movePdf = []
data = parser.from_file(
    'd:/dokumenter/pokemon/Homebrew_Rules_for_Abilities_and_Moves.pdf',
    xmlContent=True)
xhtml_data = BeautifulSoup(data['content'], "html.parser")
for page, content in enumerate(
        xhtml_data.find_all('div', attrs={'class': 'page'})):
    if 145 > page >= 50:
        _buffer = StringIO()
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        _buffer.truncate()
        _buffer.close()
        movePdf.append({
            'id': 'page_' + str(page + 1),
            'content': parsed_content['content']
        })

movesDict = {}

translateKeyDict = {
    'Move': 'Name',
    'Class': 'DType',
    'Frequency': 'Freq',
    'Effect': 'Effects',
Example #42
from tika import parser

raw = parser.from_file(
    'C:\\Users\\InnSight\\Documents\\Github\\Gloomhaven-Deck-Builder\\ghclass\\BR\\BR Cards.pdf'
)
content = raw['content'].split('Brute\n')
n = 5
list(filter(None, content[17].split('\n')))
Example #43
def convert_pdf_to_txt(path):
    raw = parser.from_file(path)
    text = (raw['content'])
    print(text)
Example #44
    def download_doronavirus_Data(self):
        # Launch Chrome
        driver = webdriver.Chrome(self.DRIVER_PATH)

        # Define the link names
        link_name1 = u'国内事例における都道府県別の患者報告数'
        link_name2 = u'国内における都道府県別の'

        # Build the list of dates to fetch (yyyy-MM-dd HH:mm:ss.SSSSSSSSS)
        dt_now = datetime.datetime.now()
        # for half-width dates (令和2年5月1日版 style)
        date_list = []
        # for full-width month, variant 1 (令和2年5月1日版 style)
        date_list_halfwidth1 = []
        # for full-width day, variant 2 (令和2年5月1日版 style)
        date_list_halfwidth2 = []
        # for fully full-width dates (令和2年5月1日版 style)
        date_list_fullwidth = []
        # for storing text file names in yyyymmdd form
        filename_list = []
        i = 0
        while (1):
            dt_now_day = dt_now.day - i
            date_list.insert(
                0, '令和2年' + str(dt_now.month) + '月' + str(dt_now_day) + '日版')
            date_list_halfwidth1.insert(
                0, '令和2年' + str(dt_now.month).translate(
                    str.maketrans(
                        {chr(0x0021 + i): chr(0xFF01 + i)
                         for i in range(94)})) + '月' + str(dt_now_day) + '日版')
            date_list_halfwidth2.insert(
                0,
                '令和2年' + str(dt_now.month) + '月' + str(dt_now_day).translate(
                    str.maketrans(
                        {chr(0x0021 + i): chr(0xFF01 + i)
                         for i in range(94)})) + '日版')
            date_list_fullwidth.insert(
                0, '令和2年' + str(dt_now.month).translate(
                    str.maketrans(
                        {chr(0x0021 + i): chr(0xFF01 + i)
                         for i in range(94)})) + '月' +
                str(dt_now_day).translate(
                    str.maketrans(
                        {chr(0x0021 + i): chr(0xFF01 + i)
                         for i in range(94)})) + '日版')
            dt_now_yyyymmdd = dt_now + datetime.timedelta(days=-i)
            filename_list.insert(0, dt_now_yyyymmdd.strftime('%Y%m%d'))
            print(date_list)
            print(date_list_halfwidth1)
            print(date_list_halfwidth2)
            print(date_list_fullwidth)
            print(filename_list)
            if str(i + 1) == str(dt_now.day):
                break
            i += 1

        # counter variable
        i = 0
        # loop once for every date to fetch
        for data in date_list:
            # Open the URL
            # Open the MHLW page --- (*2)
            driver.get(
                'https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html'
            )
            # Wait until the page has loaded --- (*3)
            time.sleep(1)

            try:
                # Download the coronavirus status report for this date (half-width date variant)
                driver.find_element_by_partial_link_text(data).click()
            except Exception:

                # wait 2 seconds
                time.sleep(2)
                try:
                    # convert to full-width month (variant 1) and search again
                    driver.find_element_by_partial_link_text(
                        date_list_halfwidth1[i]).click()
                except Exception:

                    try:
                        # convert to full-width day (variant 2) and search again
                        driver.find_element_by_partial_link_text(
                            date_list_halfwidth2[i]).click()
                    except Exception:

                        try:
                            # convert to a fully full-width date and search again
                            driver.find_element_by_partial_link_text(
                                date_list_fullwidth[i]).click()

                        except Exception:
                            print(data + ' was not found.')
                            continue

            # wait 2 seconds
            time.sleep(2)
            # navigate to the PDF file
            try:
                driver.find_element_by_partial_link_text(link_name1).click()

            except Exception:
                driver.find_element_by_partial_link_text(link_name2).click()

            # wait 5 seconds
            time.sleep(5)
            # get the URL of the PDF
            cur_url = driver.current_url

            # convert the PDF to text
            file_data = parser.from_file(cur_url)
            text = file_data["content"]
            # print(text)
            text_path = self.FILE_PATH + self.SEPALATE + 'result_' + data + '.txt'
            file = open(text_path, 'w')
            file.write(text)
            file.close()

            # edit the extracted text
            # build the prefecture data
            DATA = """
            北海道
            青森県
            岩手県
            宮城県
            秋田県
            山形県
            福島県
            茨城県
            栃木県
            群馬県
            埼玉県
            千葉県
            東京都
            神奈川県
            新潟県
            富山県
            石川県
            福井県
            山梨県
            長野県
            岐阜県
            静岡県
            愛知県
            三重県
            滋賀県
            京都府
            大阪府
            兵庫県
            奈良県
            和歌山県
            鳥取県
            島根県
            岡山県
            広島県
            山口県
            徳島県
            香川県
            愛媛県
            高知県
            福岡県
            佐賀県
            長崎県
            熊本県
            大分県
            宮崎県
            鹿児島県
            沖縄県
            """
            # drop blank lines and non-matching lines from the text
            path = text_path
            output = ''
            with open(path) as f:
                for s_line in f:
                    pre = re.sub('^([^\s]*).*', '\\1', s_line)
                    if pre in DATA and re.sub('\s', '', pre) != '':
                        s_line2 = re.sub(
                            '^([^\s]*)\s([^\s]*)\s([^\s]*)\s([^\s]*).*',
                            '\\1,\\2,\\3,\\4', s_line)
                        str2 = re.sub('.*,(.*),.*,.*', '\\1', s_line2)
                        str2 = re.sub('\r|\n', '', str2)
                        if str2.isdigit():
                            s_line3 = re.sub('(.*),(.*),(.*),.*',
                                             '\\1,\\2,\\3', s_line2)
                        else:
                            s_line3 = re.sub('(.*),.*,(.*),(.*)',
                                             '\\1,\\2,\\3', s_line2)
                        output = output + s_line3
                    else:
                        pre = re.sub('^[^\s]*\s([^\s]*).*', '\\1', s_line)
                        if pre in DATA and re.sub('\s', '', pre) != '':
                            s_line2 = re.sub(
                                '^[^\s]*\s([^\s]*)\s([^\s]*)\s([^\s]*)\s[^\s]*.*',
                                '\\1,\\2,\\3', s_line)
                            output = output + s_line2
            file = open(self.FILE_PATH + self.SEPALATE + filename_list[i] +
                        '.txt',
                        'w',
                        encoding="utf-8")
            file.write(output)
            f.close()
            file.close()

            os.remove(path)

            print(data + ' was found.')
            i += 1

        # wait 2 seconds
        time.sleep(2)
        # close the driver
        driver.close()
        driver.quit()
Example #45
#Author: Dhivyabharathi Ramasamy
#This script extracts text from pdf using tika. Set path as required.

# Import statements
import os
from glob import glob
from urllib.request import urlopen
from tika import parser
import requests

#Set path
pdf_path = os.getcwd() + '/data/pdf/'
extract_path = os.getcwd() + '/data/extracted_files/'

# Extract pdf to text
files = glob(pdf_path + '*.pdf')
print("files found ", len(files))

count = 0
for file in files:
    filename = file.split(pdf_path)[1]
    try:
        raw = parser.from_file(pdf_path + filename)
        with open(extract_path + filename, 'w+') as f:
            f.write(raw['content'])
        count += 1
    except:
        print("unable to parse ", filename)

print("files extracted ", count)
print('Done')
Example #46
 def get_metadata(self):
     parsed = parser.from_file(self.get_file(self.url))
     return parsed["metadata"]
Example #47
#!/usr/bin/env python

from tika import parser
parsed = parser.from_file('financials.pdf')

with open('financials_metadata.txt', 'w') as file:
    file.write(str(parsed["metadata"]))

with open('financials_content.txt', 'w') as file:
    file.write(parsed["content"])
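If Tika cannot extract any text, parsed["content"] is None and the second write above raises a TypeError; a defensive variant of that write:

with open('financials_content.txt', 'w') as file:
    # fall back to an empty file when no text could be extracted
    file.write(parsed["content"] or "")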
Example #48
 def get_text(self):
     parsed = parser.from_file(self.get_file(self.url))
     return parsed["content"]
Example #49
import tika
tika.initVM()
from tika import parser
parsed = parser.from_file('/path/to/file')
#print parsed["metadata"]
print parsed["content"]
def pdftotext_converter(filename):
    raw = parser.from_file(filename)
    return(raw['content']) #returns a string containing the plain text of a PDF file
Example #51
def insert_pdf(args):
    buf = StringIO()
    with redirect_stdout(buf), redirect_stderr(buf):
        pdf_path, engine_string_, engine2_string_ = args
        pdf_path = Path(pdf_path)
        engine_ = create_engine(engine_string_)
        engine2_ = create_engine(engine2_string_)

        def get_number_of_pages():
            with pdf_path.open("rb") as pdf:
                reader = PyPDF2.PdfFileReader(pdf)
                if reader.isEncrypted:
                    reader.decrypt("")
                total_pages = reader.getNumPages()
                return total_pages

        def check_if_file_is_in_db_already():
            with engine_.connect() as conn_:
                stmt = text(
                    "SELECT * FROM pcmr.pdfs WHERE pdfName = :pdf_name;")
                result_ = conn_.execute(stmt, {"pdf_name": pdf_path.stem})
                return True if result_.rowcount > 0 else False

        # noinspection SqlResolve
        def get_pdf_metadata():
            stmt = text(
                "SELECT ParentID, DataID, CreateDate FROM Regulatory_Untrusted._RegDocs.DTreeCore "
                "WHERE Name LIKE :file_name;")
            with engine2_.connect() as conn_:
                df = pd.read_sql(stmt,
                                 conn_,
                                 params={"file_name": pdf_path.stem + "%"})
            return df.to_dict("records")[0]

        try:
            if check_if_file_is_in_db_already():
                return

            metadata = get_pdf_metadata()
            metadata["pdf_name"] = pdf_path.stem
            metadata["pdf_size"] = int(
                pdf_path.stat().st_size / 1024 / 1024 * 100) / 100
            metadata["total_pages"] = get_number_of_pages()
            metadata["xmlContent"] = parser.from_file(
                str(pdf_path), xmlContent=True)["content"]

            csv_data = get_additional_data(pdf_path.stem)
            metadata["company"] = csv_data["company"]
            metadata["submitter"] = csv_data["submitter"]
            metadata["application_id"] = csv_data["application_id"]

            with engine_.connect() as conn:
                statement = text(
                    "INSERT INTO pdfs (pdfId, pdfName, pdfSize, filingId, date, totalPages, xmlContent,"
                    "company, submitter, application_id, status) " +
                    "VALUE (:DataID,:pdf_name,:pdf_size,:ParentID,:CreateDate,:total_pages, :xmlContent, "
                    ":company, :submitter, :application_id, '');")
                result = conn.execute(statement, metadata)
            print(
                f"{pdf_path.stem}: successfully inserted {result.rowcount} rows"
            )
        except Exception as e:
            print(f"{pdf_path.stem}: ERROR! {e}")
            traceback.print_tb(e.__traceback__)
        finally:
            return buf.getvalue()
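insert_pdf takes a single (pdf_path, engine_string, engine2_string) tuple and captures its own stdout/stderr, which suggests it is meant to be mapped over a worker pool; a hedged sketch of a possible driver (the connection strings, input directory, and pool size are placeholders, not from the original source):

from multiprocessing import Pool
from pathlib import Path

# Placeholder connection strings and input directory.
engine_string = "mysql+mysqldb://user:password@localhost/pcmr"
engine2_string = "mssql+pyodbc://user:password@regdocs_dsn"
jobs = [(str(p), engine_string, engine2_string) for p in Path("pdfs").glob("*.pdf")]

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        for captured_output in pool.map(insert_pdf, jobs):
            print(captured_output, end="")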
Example #52
0
def pdf_to_text(pdf_file_name):
    content = parser.from_file(pdf_file_name)
    return content['content']
Example #53
0
def main(pdfdir, textdir):

    dirlist = [fn for fn in os.listdir(pdfdir) if fn.endswith('.pdf')]

    print('Extracting text, using Tika, from %d files in %s.' %
          (len(dirlist), pdfdir))
    print('  Writing output text files to %s.' % textdir)

    if not os.path.exists(textdir):
        os.mkdir(textdir)

    widgets = [
        'Files (of %d): ' % len(dirlist),
        Percentage(), ' ',
        Bar('='), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=widgets, maxval=len(dirlist)).start()

    for (i, fn) in enumerate(dirlist):
        pbar.update(i)
        #if int(fn.split('.')[0]) != 1001:
        #    continue
        #print fn
        parsed = parser.from_file(pdfdir + '/' + fn)

        try:
            if parsed['content'] is None:
                print('Tika found no content in %s.' % fn)
                import pdb
                pdb.set_trace()
                continue
        except Exception:
            print('Tika could not parse %s.' % fn)
            continue

        with io.open(textdir + '/' + fn[0:-4] + '.txt', 'w',
                     encoding='utf8') as outf:
            cleaned = parsed['content']

            # Translate some UTF-8 punctuation to ASCII
            punc = {
                0x2018: 0x27,
                0x2019: 0x27,  # single quote
                0x201C: 0x22,
                0x201D: 0x22,  # double quote
                0x2010: 0x2d,
                0x2011: 0x2d,
                0x2012: 0x2d,
                0x2013: 0x2d,  # hyphens
                0xF0B0: 0xb0,  # degree
                0xFF0C: 0x2c,  # comma
                0x00A0: 0x20,  # space
                0x2219: 0x2e,
                0x2022: 0x2e,  # bullets
            }
            # 0x005E:0x5e, 0x02C6:0x5e, 0x0302:0x5e, 0x2038:0x5e, # carets
            # 0x00B0:0x6f, 0x02DA:0x6f, # degree
            # 0x00B9:0x31, 0x00B2:0x32, 0x00B3:0x33, # exponents
            cleaned = cleaned.translate(punc)

            # Replace newlines that separate words with a space (unless hyphen)
            cleaned = re.sub(r'([^\s-])[\r\n]+([^\s])', '\\1 \\2', cleaned)

            # Remove hyphenation at the end of lines
            # (this is sometimes bad, as with "Fe-\nrich")
            cleaned = cleaned.replace('-\n', '\n')

            # Remove all newlines
            cleaned = re.sub(r'[\r\n]+', '', cleaned)

            # Remove xxxx.PDF
            cleaned = re.sub(r'([0-9][0-9][0-9][0-9]\.PDF)',
                             '',
                             cleaned,
                             flags=re.IGNORECASE)
            # And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
            # with optional parentheses, optional LPI contrib
            cleaned = re.sub(
                r'([0-9][0-9].. Lunar and Planetary Science Conference \(?(19|20)[0-9][0-9]\)?)( \(LPI Contrib. No. [0-9][0-9][0-9][0-9]\))? ?',
                '',
                cleaned,
                flags=re.IGNORECASE)
            # And "Lunar and Planetary Science XXXIII (2002)"
            # with Roman numeral and optional year
            cleaned = re.sub(
                r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?',
                '',
                cleaned,
                flags=re.IGNORECASE)

            # Remove mailto: links
            cleaned = re.sub(r'mailto:[^\s]+', '', cleaned)

            outf.write(cleaned)
            outf.close()
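Taken on their own, the newline rules above behave as follows; this standalone check (Python 3 syntax, made-up sample) also reproduces the "Fe-\nrich" caveat mentioned in the comment:

import re

# Made-up sample with a hyphenated line break and an ordinary line break.
sample = "Fe-\nrich basalts were\nanalysed"
step1 = re.sub(r'([^\s-])[\r\n]+([^\s])', '\\1 \\2', sample)  # join words split across lines
step2 = step1.replace('-\n', '\n')                            # drop end-of-line hyphenation
step3 = re.sub(r'[\r\n]+', '', step2)                         # remove remaining newlines
print(step3)  # "Ferich basalts were analysed" -- the hyphen in "Fe-rich" is lost, as warned above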
Example #54
0
def pdf_parser(path):
    raw = parser.from_file(path)
    return raw['content']
Example #55
0
def extract_information(pdf_path):
    raw_text = parser.from_file(pdf_path)
    raw_list = raw_text['content'].splitlines()
    text = " ".join(raw_list)
    return text
Example #56
0
def convert_to_text(file_name):
    logger.info("Converting file to text")
    parsed = parser.from_file(file_name)
    full_text = parsed["content"]
    return full_text
Example #57
0
    def text_extract(self):

        parsed_file = parser.from_file(self.file_path)
        self.meta_data = parsed_file['metadata']
        parsed_content = parsed_file['content']
        return parsed_content
Example #58
0
pdfWriter = PyPDF2.PdfFileWriter()

for files_address in pdf_files:
    pdfFileObj = open(files_address, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    for pageNum in range(0, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)

        pdfWriter.addPage(pageObj)

pdfOutput = open('merged.pdf', 'wb')
pdfWriter.write(pdfOutput)
pdfOutput.close()

raw = parser.from_file("/Users/docha/PycharmProjects/Tools_for_buh/merged.pdf")
raw = raw['content']

special_char_map = {
    ord('ä'): 'a',
    ord('ü'): 'u',
    ord('ö'): 'o',
    ord('õ'): 'o',
    ord('ž'): 'z',
    ord('š'): 's',
    ord('Ä'): 'A',
    ord('Ü'): 'U',
    ord('Ö'): 'O',
    ord('Õ'): 'O',
    ord('Ž'): 'Z',
    ord('Š'): 'S'
}
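Example #58 appears to be cut off after the mapping above; a minimal sketch of how special_char_map would typically be applied to the extracted text (the output filename is an assumption):

# Transliterate the extracted text and write it out (hypothetical output path).
cleaned = (raw or '').translate(special_char_map)
with open('merged.txt', 'w', encoding='utf-8') as out:
    out.write(cleaned)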
Example #59
0
import re
import pandas as pd
from tika import parser

raw = parser.from_file('KW18Abstracts.pdf')
raw_text = raw['content'][3580:12120360]


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# split by regex for abstract names
s = r'((TH|FR|SA)?-?(OR|PO|PUB)\d{3,4})\s'
split_abstracts = re.split(s, raw_text)

# Above regex splits into 4 sections by matching groups
#   poster_id
#   day
#   poster_type
#   poster_content

abstract_list = [sections for sections in chunks(split_abstracts[1:], 4)]
abstract_df = pd.DataFrame(
    abstract_list,
    columns=['poster_id', 'day', 'poster_type', 'poster_content'])
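As a quick illustration of how the regex split and chunks() cooperate, the made-up sample below yields one (poster_id, day, poster_type, poster_content) tuple per abstract:

# Tiny synthetic sample; not taken from KW18Abstracts.pdf.
sample = "TH-PO123 First abstract body. FR-OR045 Second abstract body."
parts = re.split(s, sample)  # s is the abstract-id pattern defined above
for poster_id, day, poster_type, body in chunks(parts[1:], 4):
    print(poster_id, day, poster_type, body.strip())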


def section_split(text):
    # The body of this function is truncated in the source.
    pass
Example #60
0
import os
from tika import parser

savePath = "txt"

if not os.path.exists(savePath):
    os.mkdir(savePath)

folder_name = "arxiv"
file_list = [name for name in os.listdir(folder_name)]
file_list = [folder_name + '/' + str(name) for name in file_list]
i = 1
for file in file_list:
    raw = parser.from_file(file)
    text = raw['content']
    text = str(text)
    l1 = text.split('\n')
    l2 = []
    for line in l1:
        if len(line) > 10:
            l2.append(line)
    dpath = os.path.join(savePath, str(i) + ".txt")
    with open(dpath, "w", encoding='utf-8') as code:
        for line in l2:
            code.write(str(line))
            code.write('\n')
    i += 1
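A hedged variant of the loop above that skips files for which Tika returns no text, rather than writing the literal string "None" (assuming that skipping such files is the desired behaviour):

for i, file in enumerate(file_list, start=1):
    raw = parser.from_file(file)
    content = raw.get('content')
    if not content:
        print('no text extracted from', file)
        continue
    # Keep only lines longer than 10 characters, as in the original loop.
    lines = [line for line in content.split('\n') if len(line) > 10]
    with open(os.path.join(savePath, str(i) + ".txt"), "w", encoding='utf-8') as code:
        code.write('\n'.join(lines) + '\n')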