def __init__(self, fileName):
    parsed = parser.from_file(fileName)
    metadata = parsed["metadata"]

    # Normalize the extracted content: strip newlines, tabs and quotes,
    # then collapse any remaining non-word characters into single spaces.
    content = parsed["content"]
    content = content.replace('\n', '')
    content = content.replace('\t', '')
    content = content.replace('\'', '')
    content = content.replace('\"', '')
    rx = re.compile(r'\W+')
    content = rx.sub(' ', content).strip()
    self.content = content

    # Title: fall back to 'Untitled' when the metadata has no title.
    try:
        title = metadata['title']
    except KeyError:
        title = 'Untitled'
    title = title.replace('\n', '')
    title = title.replace('\t', '')
    title = title.replace('\'', '')
    title = title.replace('\"', '')
    title = rx.sub(' ', title).strip()
    self.title = title

    # self.type = self.metadata['Content-Type-Hint']
    # self.name = self.metadata['resourceName']
    self.lang = language.from_file(fileName)
def runNER(self):
    # Extract the plain-text content and write it to the temporary path.
    f = open(self.tpath, "w+")
    f.write(TikaWrapper(self.path).getContent().encode('UTF-8'))
    f.close()

    # Reduce the text to its interesting regions before running NER.
    extracted = TikaWrapper(self.tpath).getInterstingRegions()
    f = open(self.tpath, "w+")
    f.write(extracted.encode('UTF-8'))
    f.close()

    evaL = NEREvaluator(TikaWrapper(self.tpath).runNER())

    self.metadata['language'] = language.from_file(self.tpath)
    self.metadata['ner'] = {
        'opennlp': evaL.opennlp(),
        'corenlp': evaL.corenlp(),
        'nltk': evaL.nltk(),
        'overlap': evaL.overlap(),
    }
    try:
        self.metadata['measurements'] = evaL.measurements()
    except Exception:
        self.metadata['measurements'] = []
def load_topics(filename):
    languages.append(language.from_file(filename))
    parser_obj = parser.from_file(filename)
    if 'content' in parser_obj and parser_obj['content']:
        words.extend(get_nouns(parser_obj['content']))
    if 'metadata' in parser_obj:
        metadata_dict = parser_obj['metadata']
        # Tika metadata values can be either a single string or a list.
        if 'Author' in metadata_dict:
            if isinstance(metadata_dict['Author'], list):
                metadata.append(metadata_dict['Author'][0])
            else:
                metadata.append(metadata_dict['Author'])
        if 'xmp:CreatorTool' in metadata_dict:
            if isinstance(metadata_dict['xmp:CreatorTool'], list):
                metadata.extend(metadata_dict['xmp:CreatorTool'])
            else:
                metadata.append(metadata_dict['xmp:CreatorTool'])
        if 'Content-Type' in metadata_dict:
            if isinstance(metadata_dict['Content-Type'], list):
                metadata.append(metadata_dict['Content-Type'][0])
            else:
                metadata.append(metadata_dict['Content-Type'])
        if 'Company' in metadata_dict:
            if isinstance(metadata_dict['Company'], list):
                metadata.append(metadata_dict['Company'][0])
            else:
                metadata.append(metadata_dict['Company'])
def loadMD(self):
    md = TikaWrapper(self.path).getMetadata()
    self.metadata = {
        'id': self.id,
        'content-type': md['Content-Type'],
        'tika-metadata': md,
        'size': getsize(self.path),
        'language': language.from_file(self.path),
        'crawl': self.requestData
    }
def main():
    start_time = datetime.now()
    # Read the data from the following path
    data_files = '/Users/Antrromet/Documents/USC/Spring2016/CDA_CSCI599/Assignment_2/data/'
    dashboard_data = []
    # Write the response in the following file
    i = 0
    for path, dirs, files in os.walk(data_files):
        dirs.sort()
        path_spl = path.split('/')
        content_type = path_spl[len(path_spl) - 1].replace('_', '/')
        for f in sorted(files):
            if f != '.DS_Store':  # skip Finder metadata files
                if i >= 0:
                    i += 1
                    print(str(i) + '. ' + content_type + ' - ' + str(f))
                    lang = language.from_file(path + '/' + f)
                    added_lan = False
                    found_content = False
                    for item in dashboard_data:
                        if item['ContentType'] == content_type:
                            found_content = True
                            for lan in item['Languages']:
                                if lan == lang:
                                    added_lan = True
                                    val = item['Languages'][lang]
                                    item['Languages'][lang] = val + 1
                                    break
                            if not added_lan:
                                item['Languages'][lang] = 1
                    if not found_content:
                        dashboard_data.append({'ContentType': content_type, 'Languages': {lang: 1}})
                else:
                    i += 1
                    if i % 1000 == 0:
                        print('Parsed ' + str(i) + ' files')

    print(json.dumps(dashboard_data, indent=4))
    end_time = datetime.now()
    print(end_time - start_time)
    output_file = open('language_diversity.data', 'w+')
    json.dump(dashboard_data, output_file)
    output_file.close()
# Excerpt from the per-file loop; the outer try blocks that the later
# `except` clauses close are opened earlier, around the Tika content extraction.
        try:
            fjson["languages"] = {}
            languages = detect_langs(f_text)
            for l in languages:
                (lang, probability) = str(l).split(":")
                fjson["languages"][lang] = probability
        except Exception:
            print("\n Language Detection module encountered an error")
        #print(" Languages Detected {l}".format(l=languages))
        #pp.pprint(fjson["languages"])
    except (KeyError, ValueError):
        print("Tika could not get content for {f}".format(f=fpath))
        fjson["languages"] = " "
    fhandle.close()
    fjson["id"] = fname
    fjson["size"] = os.path.getsize(fpath)
    #print("Size of file : " + str(fjson["size"]))
except ValueError:
    print("Tika could not get content for {f}".format(f=fpath))

try:
    fjson["tika_language"] = language.from_file(fpath)
    #print(" Languages Detected by Tika {l}".format(l=fjson["tika_language"]))
except UnicodeDecodeError:
    fjson["tika_language"] = " "
    print("Tika encountered a problem reading the text for identifying languages! Skipping")

mime_json[dirName].append(fjson)
filename = "lang_jsons/" + dirName + "_lang.json"
with open(filename, "w") as ohandle:
    json.dump(mime_json, ohandle)
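The `(lang, probability) = str(l).split(":")` step above relies on langdetect printing each candidate as `code:probability`. A minimal, self-contained sketch of just that parsing, assuming only the langdetect package (the sample sentence is illustrative):

# Sketch: parse langdetect candidates into a {language: probability} dict.
from langdetect import DetectorFactory, detect_langs

DetectorFactory.seed = 0  # make langdetect deterministic for the example

languages = {}
for candidate in detect_langs("War does not determine who is right, only who is left."):
    lang, probability = str(candidate).split(":")  # each candidate prints as e.g. "en:0.99999"
    languages[lang] = probability
print(languages)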
import os
import sys
import json
from collections import defaultdict

import tika
from tika import language

arg1 = str(sys.argv[1])
language_map = defaultdict(list)


def printMap(tag, filename):
    if os.path.exists(filename):
        os.remove(filename)
    with open(filename, 'a+') as fopen:
        json.dump(tag, fopen)


count = 0
tika.initVM()  # initialize Tika once, not on every file
for root, dirs, files in os.walk(arg1):
    for file in files:
        if file != '.DS_Store':
            # Count and report each processed file once.
            count += 1
            print count
            path = os.path.join(root, file)
            try:
                lang = language.from_file(path)
            except Exception:
                lang = 'unknown'
            print lang
            language_map[lang].append(file)
printMap(language_map, "language.json")
def imageparser(self, response):
    pmspiderItem = pmScrapeItem()
    temp = (response.url).split('file://')[1]
    pdf_file = basename(response.url)

    # First page of the product monograph: everything before the 'Page:1'
    # marker, lower-cased, with punctuation and repeated spaces collapsed.
    pmspiderItem['pm_page_one'] = ((response.body).split('Page:1')[0]).decode('utf-8').replace("\n", "")
    pm_page_one = ((response.body).split('Page:1')[0]).replace("\n", "")
    replace_punctuation = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    pm_page_one = pm_page_one.translate(replace_punctuation)
    pm_page_one = re.sub(' +', ' ', pm_page_one.lower())
    # str.replace returns a new string, so the result must be reassigned.
    pm_page_one = pm_page_one.replace('somatropin', 'somatotropin')

    f = open('pmpageone.txt', 'w')
    f.write(pm_page_one)
    f.close()

    pmspiderItem['content'] = response.body
    content = response.body
    pmspiderItem['file_type'] = 'PDF'
    pmspiderItem['pm_number'] = splitext(basename(response.url))[0].decode('utf-8')
    pm_number = splitext(basename(response.url))[0].decode('utf-8')
    pmspiderItem['id'] = pm_number
    pmspiderItem['file_path'] = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    file_path = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    pmspiderItem['file_name'] = ''.join(splitext(basename(file_path))).decode('utf-8')
    pmspiderItem['date_scraped'] = datetime.datetime.now()
    pmspiderItem['server'] = socket.gethostname()
    pmspiderItem['project'] = self.settings.get('BOT_NAME')
    pmspiderItem['spider'] = self.name
    pmspiderItem['content_length'] = len(response.body)

    # Dump the content to disk, close the file, then let Tika detect the language.
    f = open('/home/hjiang/pmscrapy/pdf_text/pdftext.txt', 'w')
    f.write(content)
    f.close()
    lang = language.from_file('/home/hjiang/pmscrapy/pdf_text/pdftext.txt')
    pmspiderItem['language'] = lang

    # pt_term_index = []
    # pt_term_index = findItem(ptpm_list, pm_number)
    # if pt_term_index == []:
    #     pmspiderItem['pt_term'] = u'NA'
    #     pmspiderItem['pt_term_index'] = u'NA'
    # else:
    #     pmspiderItem['pt_term'] = ptpm_list[pt_term_index[0][0]][1].decode("utf-8")
    #     pmspiderItem['pt_term_index'] = str(pt_term_index[0][0]).decode("utf-8")

    # Match drug names (and word-order permutations of short names) against page one.
    count = 0
    for k in range(len(name_list)):
        if count >= 1:
            break
        text = name_list[k].translate(replace_punctuation)
        ele_list = text.split(' ')
        if len(ele_list) <= 4:
            # Join permutations into strings here so long names are not split into characters.
            ele_list = [' '.join(p) for p in itertools.permutations(ele_list)]
        else:
            ele_list = [' '.join(ele_list)]
        for candidate in ele_list:
            if candidate.lower() in pm_page_one.lower():
                content_index = k + 1
                pmspiderItem['atc_code'] = content_list[content_index][0]
                pmspiderItem['synonyms'] = content_list[content_index][1]
                pmspiderItem['categories'] = content_list[content_index][3]
                pmspiderItem['dosages'] = content_list[content_index][4]
                pmspiderItem['matchiterm'] = name_list[k]
                count = count + 1
                break
        # if count == 0:
        #     for synonyms in synonyms_list[k]:
        #         if synonyms == '':
        #             break
        #         elif synonyms.lower() in pm_page_one.lower():
        #             content_index = k + 1
        #             pmspiderItem['atc_code'] = content_list[content_index][0]
        #             pmspiderItem['synonyms'] = content_list[content_index][1]
        #             pmspiderItem['categories'] = content_list[content_index][3]
        #             pmspiderItem['dosages'] = content_list[content_index][4]
        #             pmspiderItem['matchiterm'] = name_list[k]
        #             count = count + 1
        #             break

    if count == 0:
        pmspiderItem['atc_code'] = u'NA'
        pmspiderItem['synonyms'] = u'NA'
        pmspiderItem['categories'] = u'NA'
        pmspiderItem['dosages'] = u'NA'
        pmspiderItem['matchiterm'] = u'NA'

    os.remove(temp)
    return pmspiderItem
def txtparser(self, response):
    pmspiderItem = pmScrapeItem()
    pdf_file = basename(response.url)

    ### clean pm_page_one ###
    pmspiderItem['pm_page_one'] = ' '.join(convert(pdf_file, pages=[0]).split())
    pm_page_one = ' '.join(convert(pdf_file, pages=[0]).split()).encode('utf-8').lower()
    pm_page_one = pm_page_one.replace('classification', '')
    replace_punctuation = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    pm_page_one = pm_page_one.translate(replace_punctuation)
    pm_page_one = re.sub(' +', ' ', pm_page_one.lower())
    ### typo correction ###
    # pm_page_one = pm_page_one.replace('somatropin', 'somatotropin')
    pm_page_one = pm_page_one.replace('p r o d u c t m o n o g r a p h', '')
    pm_page_one = pm_page_one.replace('product monograph', '')

    f = open('pmpageone.txt', 'w')
    f.write(pm_page_one)
    f.close()

    pmspiderItem['content'] = ' '.join(convert(pdf_file).split())
    content = ' '.join(convert(pdf_file).split())

    # Dump the content to disk, close the file, then let Tika detect the language.
    f = open('/home/hjiang/pmscrapy/pdf_text/pdftext.txt', 'w')
    f.write(content)
    f.close()
    lang = language.from_file('/home/hjiang/pmscrapy/pdf_text/pdftext.txt')
    pmspiderItem['language'] = lang

    temp = (response.url).split('file://')[1]
    fp = open(temp)
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = layout_scanner.get_pages(response.url)

    pmspiderItem['file_type'] = 'PDF'
    pmspiderItem['pm_number'] = splitext(basename(response.url))[0].decode('utf-8')
    pm_number = splitext(basename(response.url))[0].decode('utf-8')
    pmspiderItem['file_path'] = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    file_path = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    pmspiderItem['file_name'] = basename(file_path).decode('utf-8')
    pmspiderItem['date_scraped'] = datetime.datetime.now()
    pmspiderItem['server'] = socket.gethostname()
    pmspiderItem['project'] = self.settings.get('BOT_NAME')
    pmspiderItem['spider'] = self.name
    pmspiderItem['content_length'] = len(content)

    # pt_term_index = []
    # pt_term_index = findItem(ptpm_list, pm_number)
    # if pt_term_index == []:
    #     pmspiderItem['pt_term'] = u'NA'
    #     pmspiderItem['pt_term_index'] = u'NA'
    # else:
    #     pmspiderItem['pt_term'] = ptpm_list[pt_term_index[0][0]][1].decode("utf-8")
    #     pmspiderItem['pt_term_index'] = str(pt_term_index[0][0]).decode("utf-8")

    # Match drug names (and word-order permutations of short names) against page one.
    count = 0
    for k in range(len(name_list)):
        if count >= 1:
            break
        text = name_list[k].translate(replace_punctuation)
        ele_list = text.split(' ')
        if len(ele_list) <= 4:
            # Join permutations into strings here so long names are not split into characters.
            ele_list = [' '.join(p) for p in itertools.permutations(ele_list)]
        else:
            ele_list = [' '.join(ele_list)]
        for candidate in ele_list:
            if candidate.lower() in pm_page_one.lower():
                content_index = k + 1
                pmspiderItem['atc_code'] = content_list[content_index][0]
                pmspiderItem['synonyms'] = content_list[content_index][1]
                pmspiderItem['categories'] = content_list[content_index][3]
                pmspiderItem['dosages'] = content_list[content_index][4]
                pmspiderItem['matchiterm'] = name_list[k]
                count = count + 1
                print('yes')
                break
        # if count == 0:
        #     if synonyms_list[k] == '':
        #         print('empty list')
        #         break
        #     else:
        #         for synonyms in synonyms_list[k]:
        #             if synonyms == '':
        #                 print('missing value')
        #                 break
        #             if synonyms.lower() in pm_page_one.lower():
        #                 print("This is synonyms blablabla:%s" % synonyms)
        #                 content_index = k + 1
        #                 pmspiderItem['atc_code'] = content_list[content_index][0]
        #                 pmspiderItem['synonyms'] = content_list[content_index][1]
        #                 pmspiderItem['categories'] = content_list[content_index][3]
        #                 pmspiderItem['dosages'] = content_list[content_index][4]
        #                 pmspiderItem['matchiterm'] = synonyms
        #                 count = count + 1
        #                 print('yes1')
        #                 break

    if count == 0:
        pmspiderItem['atc_code'] = u'NA'
        pmspiderItem['synonyms'] = u'NA'
        pmspiderItem['categories'] = u'NA'
        pmspiderItem['dosages'] = u'NA'
        pmspiderItem['matchiterm'] = u'NA'
        print('no')

    os.remove(temp)
    return pmspiderItem
import json
import os
import random
from collections import OrderedDict

from tika import language
from tika import parser

path = "/Users/charanshampur/newAwsDump/testFiles4"
d3LanguageDist = open("D3Language.json", "w")
langFile = open("Language.json", "r")
langDictionary = json.load(langFile)
Language = {}

# Count how many files were detected in each language.
for path, dirs, files in os.walk(path):
    for file in files:
        if file != ".DS_Store":
            path_to_file = path + "/" + str(file)
            print path_to_file
            lang = language.from_file(path_to_file)
            if lang not in Language:
                Language[lang] = 1
            else:
                Language[lang] += 1

# Build the D3 chart entries: readable language name, count, random color.
contentList = []
for k, v in Language.items():
    content = OrderedDict()
    content["label"] = langDictionary[k]["name"]
    content["value"] = int(v)
    content["color"] = "#%06x" % random.randint(0, 0xFFFFFF)
    contentList.append(OrderedDict(content))

json.dump(contentList, d3LanguageDist, indent=4)
def test_language():
    """test_language"""
    from tika import language
    print_stars()
    print(language.from_file(doc))
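The test above covers `language.from_file`; tika-python also exposes a buffer variant, which several of the snippets above could use instead of writing text to a temporary file first. A small sketch, assuming a reachable Tika server and `language.from_buffer` behaving as in the tika-python docs:

# Sketch: in-memory language detection, no temporary file needed.
from tika import language

print(language.from_buffer("Comme si de rien n'etait"))     # expected roughly 'fr'
print(language.from_buffer("This is plain English text."))  # expected roughly 'en'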
print("--------------Metadados e o Conteudo dos arquivos--------------") for i in range(0, 5): print("--------------------------------------------------------------") print("Nome arquivo: " + arquivos[i]) parsed = parser.from_file(path_pasta_arquivos + arquivos[i]) #Faz um parse do arquivo metadata = parsed["metadata"] print(json.dumps(metadata, indent=4)) #Imprime em um formato melhor print(parsed["content"]) #Imprime o conteudo do arquivo print("--------------------------------------------------------------") print("\n\n\n") print("--------------Idioma do arquivo--------------") print("O idioma do texto eh: ", language.from_file(path_pasta_arquivos + arq7), '\n\n') #Detecta o idioma do arquivo print("--------------Traducao arquivo--------------") print(translate.from_file(path_pasta_arquivos + arq7, 'en', 'es')) #Faz uma traducao do idioma de origem print("\n\n") print("--------------Classificacao dos arquivos--------------" ) #Tipos dos arquivos MIME for arquivo in arquivos: print("Nome arquivo: %s \tTipo: %s" % (arquivo, detector.from_file(path_pasta_arquivos + arquivo))) print("\n\n")
            last_read_file.write(filename)
            # Update metrics and save to disk
            metrics_dict['total'] += 1
            metrics_dict['failed'] += 1
            with open(metrics_dict_file_path, 'wb') as picklefile:
                pickle.dump(metrics_dict, picklefile)
            continue

        # Write the extracted text to the destination
        with io.open(extracted_file_path, "w", encoding="utf-8") as f:
            f.write(parsedFile['content'])

        # Invoke tika to detect language of the text file
        lang = language.from_file(extracted_file_path)

        # Remove non-English text files
        if lang != "en":
            os.remove(extracted_file_path)
            os.remove(download_file_path)
            print("Non-English text file removed.\n")

            # Update metrics and save to disk
            metrics_dict['total'] += 1
            metrics_dict['nonEng'] += 1
            with open(metrics_dict_file_path, 'wb') as picklefile:
                pickle.dump(metrics_dict, picklefile)
        else:
            # Serialize metadata and dump in pickle file
def get_file_lang(file_path):
    result = language.from_file(file_path)
    lang = lang_keys["name." + result]
    return lang
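`get_file_lang` assumes a `lang_keys` mapping from "name.<code>" to a display name, which is not shown in this excerpt. A hypothetical sketch of how such a lookup could be wired up (the `lang_keys` contents below are illustrative, not the original data):

# Hypothetical example: map Tika's ISO 639-1 code to a readable language name.
from tika import language

lang_keys = {
    "name.en": "English",
    "name.fr": "French",
    "name.es": "Spanish",
}

def get_file_lang(file_path):
    result = language.from_file(file_path)          # e.g. "en"
    return lang_keys.get("name." + result, result)  # fall back to the raw code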