Example #1
	def __init__(self, fileName):
		parsed = parser.from_file(fileName)
		metadata = parsed["metadata"]
		#   Return re.sub('[\s+]', '', content)
		#  TODO: Delete... Very Redundant..
		content = parsed["content"]
		content = content.replace('\n', '')
		content = content.replace('\t', '')
		content = content.replace('\'', '')
		content = content.replace('\"', '')
		rx = re.compile(r'\W+')
		content = rx.sub(' ', content).strip()
		self.content = content
		#   Title...
		try:
			title = metadata['title']
		except KeyError:
			title = 'Untitled'
		title = title.replace('\n', '')
		title = title.replace('\t', '')
		title = title.replace('\'', '')
		title = title.replace('\"', '')
		title = rx.sub(' ', title).strip()
		self.title = title
		#  self.type = self.metadata['Content-Type-Hint']
		#  self.name = self.metadata['resourceName']
		#  lanFix = re.sub('[\s+]', '', content)
		self.lang = language.from_file(fileName)
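
The constructor above parses a file with tika.parser.from_file, strips newlines, tabs, quotes and other non-word characters from the extracted content, and records the document language. A minimal standalone sketch of the same flow (the helper name is illustrative, and it assumes tika-python can reach a running Tika server):

import re
from tika import parser, language

def extract_clean_text(file_name):
    # Parse with Tika; 'content' can be None for unsupported or empty files.
    parsed = parser.from_file(file_name)
    content = parsed.get("content") or ""
    # Collapse runs of non-word characters (newlines, tabs, quotes, punctuation) into single spaces.
    content = re.sub(r'\W+', ' ', content).strip()
    # Detect the language of the original file, as in the constructor above.
    lang = language.from_file(file_name)
    return content, lang
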
  def runNER(self):
    # Write the raw Tika-extracted content to the temp path.
    with open(self.tpath, "w+") as f:
      f.write(TikaWrapper(self.path).getContent().encode('UTF-8'))

    # Overwrite the temp file with just the extracted regions of interest.
    extracted = TikaWrapper(self.tpath).getInterstingRegions()
    with open(self.tpath, "w+") as f:
      f.write(extracted.encode('UTF-8'))

    evaL = NEREvaluator(TikaWrapper(self.tpath).runNER())

    self.metadata['language'] = language.from_file(self.tpath)

    self.metadata['ner'] = {
      'opennlp': evaL.opennlp(),
      'corenlp': evaL.corenlp(),
      'nltk'   : evaL.nltk(),
      'overlap': evaL.overlap(),
    }

    try:
      self.metadata['measurements'] = evaL.measurements()
    except Exception:
      self.metadata['measurements'] = []
def load_topics(filename):
	languages.append(language.from_file(filename))
	parser_obj = parser.from_file(filename)
	if 'content' in parser_obj and parser_obj['content']:
		words.extend(get_nouns(parser_obj['content']))
	if 'metadata' in parser_obj:
		metadata_dict = parser_obj['metadata']
		if 'Author' in metadata_dict:
			if isinstance(metadata_dict['Author'], list):
				metadata.append(metadata_dict['Author'][0])
			else:
				metadata.append(metadata_dict['Author'])

		if 'xmp:CreatorTool' in metadata_dict:
			if isinstance(metadata_dict['xmp:CreatorTool'], list):
				metadata.extend(metadata_dict['xmp:CreatorTool'])
			else:
				metadata.append(metadata_dict['xmp:CreatorTool'])

		if 'Content-Type' in metadata_dict:
			if isinstance(metadata_dict['Content-Type'], list):
				metadata.append(metadata_dict['Content-Type'][0])
			else:
				metadata.append(metadata_dict['Content-Type'])
		if 'Company' in metadata_dict:
			if isinstance(metadata_dict['Company'], list):
				metadata.append(metadata_dict['Company'][0])
			else:
				metadata.append(metadata_dict['Company'])
 def loadMD(self):
   md = TikaWrapper(self.path).getMetadata()
   self.metadata = {
     'id': self.id,
     'content-type': md['Content-Type'],
     'tika-metadata': md,
     'size': getsize(self.path),
     'language': language.from_file(self.path),
     'crawl': self.requestData
   }
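
loadMD() above collects metadata through a project-specific TikaWrapper class. With tika-python directly, roughly the same fields can be pulled from parser.from_file; the sketch below assumes the usual 'Content-Type' key is present in Tika's metadata and uses illustrative parameter names:

from os.path import getsize
from tika import parser, language

def load_md(path, doc_id, request_data):
    md = parser.from_file(path)["metadata"]
    return {
        'id': doc_id,
        'content-type': md.get('Content-Type'),
        'tika-metadata': md,
        'size': getsize(path),
        'language': language.from_file(path),
        'crawl': request_data,
    }
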
import os
import json
from datetime import datetime
from tika import language

def main():
    start_time = datetime.now()
    # Read the data from the following path
    data_files = '/Users/Antrromet/Documents/USC/Spring2016/CDA_CSCI599/Assignment_2/data/'

    dashboard_data = []
    # Write the response in the following file
    i = 0
    for path, dirs, files in os.walk(data_files):
        dirs.sort()
        path_spl = path.split('/')
        content_type = path_spl[-1].replace('_', '/')

        for f in sorted(files):
            if f != '.DS_Store':
                if i >= 0:
                    i += 1
                    print(str(i) + '. ' + content_type + ' - ' + str(f))
                    lang = language.from_file(path + '/' + f)
                    added_lan = False
                    found_content = False
                    for item in dashboard_data:
                        if item['ContentType'] == content_type:
                            found_content = True
                            for lan in item['Languages']:
                                if lan == lang:
                                    added_lan = True
                                    val = item['Languages'][lang]
                                    item['Languages'][lang] = val + 1
                                    break
                            if not added_lan:
                                item['Languages'][lang] = 1
                    if not found_content:
                        dashboard_data.append({'ContentType': content_type, 'Languages': {lang: 1}})
                else:
                    i += 1

                if i % 1000 == 0:
                    print('Parsed ' + str(i) + ' files')

    print(json.dumps(dashboard_data, indent=4))
    end_time = datetime.now()
    print(end_time - start_time)
    with open('language_diversity.data', 'w+') as output_file:
        json.dump(dashboard_data, output_file)
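
main() above keeps the per-content-type language counts in a list of dicts and rescans that list for every file. A sketch of the same tally keyed directly by content type (the helper name and the nested-dict shape are illustrative, not from the original):

import os
from collections import defaultdict
from tika import language

def tally_languages(data_files):
    # {content_type: {language: count}}
    counts = defaultdict(lambda: defaultdict(int))
    for path, dirs, files in os.walk(data_files):
        content_type = os.path.basename(path).replace('_', '/')
        for f in sorted(files):
            if f != '.DS_Store':
                lang = language.from_file(os.path.join(path, f))
                counts[content_type][lang] += 1
    return counts
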
						try:
							fjson["languages"] = {}
							languages = detect_langs(f_text)
							for l in languages:
								(lang,probability) = str(l).split(":")
								fjson["languages"][lang] = probability
						except Exception:
							print("\n Language Detection module encountered error")
						#print(" Languages Detected {l}".format(l=languages))
						#pp.pprint(fjson["languages"])
					except (KeyError,ValueError):
						print("Tika could not get content for {f}".format(f=fpath))
						fjson["languages"] = " "
					fhandle.close()
					fjson["id"] = fname
					fjson["size"] = os.path.getsize(fpath)
					#print("Size of file : "+str(fjson["size"]))
				except ValueError:
					print("Tika could not get content for {f}".format(f=fpath))
				try:
					fjson["tika_language"] = language.from_file(fpath)
					#print(" Languages Detected by Tika {l}".format(l=fjson["tika_language"]))
				except UnicodeDecodeError:
					fjson["tika_language"] = " "
					print("Tika encountered problem reading the text for identifying Languages! Skipping")
				mime_json[dirName].append(fjson)

	filename = "lang_jsons//"+dirName+"_lang.json"
	with open(filename, "w") as ohandle:
		json.dump(mime_json, ohandle)
Example #8
import os
import sys
import json
from collections import defaultdict
import tika
from tika import language

arg1 = str(sys.argv[1])
language_map = defaultdict(list)

def printMap(tag,filename):
    if os.path.exists(filename):
        os.remove(filename)
    with open(filename,'a+') as fopen:
        json.dump(tag,fopen)
# Initialise the Tika VM once, not once per file.
tika.initVM()
count = 0
for root, dirs, files in os.walk(arg1):
    for file in files:
        if file != '.DS_Store':
            count += 1
            print(count)
            path = os.path.join(root, file)
            try:
                lang = language.from_file(path)
            except Exception:
                lang = 'unknown'
            print(lang)
            language_map[lang].append(file)

printMap(language_map, "language.json")

    def imageparser(self, response):
        pmspiderItem = pmScrapeItem()
        temp = (response.url).split('file://')[1]
        pdf_file = basename(response.url)
        pmspiderItem['pm_page_one']=((response.body).split('Page:1')[0]).decode('utf-8').replace("\n", "")
        pm_page_one=((response.body).split('Page:1')[0]).replace("\n", "")
        replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
        pm_page_one = pm_page_one.translate(replace_punctuation)
        pm_page_one = re.sub(' +',' ',pm_page_one.lower())
        pm_page_one = pm_page_one.replace('somatropin','somatotropin')
        f = open('pmpageone.txt','w')
        f.write(pm_page_one)
        f.close()
        
        pmspiderItem['content']=response.body
        content = response.body
        pmspiderItem['file_type']='PDF'
        pmspiderItem['pm_number']=splitext(basename(response.url))[0].decode('utf-8')
        pm_number = splitext(basename(response.url))[0].decode('utf-8')
        pmspiderItem['id']=pm_number
        pmspiderItem['file_path']='https://pdf.hres.ca/dpd_pm/%s.PDF'%pm_number
        file_path='https://pdf.hres.ca/dpd_pm/%s.PDF'%pm_number
        pmspiderItem['file_name']=''.join(splitext(basename(file_path))).decode('utf-8')
        pmspiderItem['date_scraped']=datetime.datetime.now()
        pmspiderItem['server']=socket.gethostname()
        pmspiderItem['project']=self.settings.get('BOT_NAME')
        pmspiderItem['spider']=self.name
        pmspiderItem['content_length']=len(response.body)
        
        # Write and close the extracted text before Tika reads it for language detection.
        with open('/home/hjiang/pmscrapy/pdf_text/pdftext.txt', 'w') as f:
            f.write(content)
        lang = language.from_file('/home/hjiang/pmscrapy/pdf_text/pdftext.txt')
        pmspiderItem['language'] = lang

        # pt_term_index = []
        # pt_term_index=findItem(ptpm_list,pm_number)
        # if pt_term_index == []:
        #     pmspiderItem['pt_term'] = u'NA'
        #     pmspiderItem['pt_term_index'] = u'NA'
        # else:
        #     pmspiderItem['pt_term'] = ptpm_list[pt_term_index[0][0]][1].decode("utf-8")
        #     pmspiderItem['pt_term_index'] = str(pt_term_index[0][0]).decode("utf-8")


        count = 0
        for k in range(len(name_list)):
            if count >= 1:
                break
            text = name_list[k].translate(replace_punctuation)
            ele_list = text.split(' ')
            if len(ele_list) <= 4:
                # Join each permutation back into a single candidate phrase.
                ele_list = [' '.join(p) for p in itertools.permutations(ele_list)]
            else:
                ele_list = [text]
            for i in range(len(ele_list)):
                if ele_list[i].lower() in pm_page_one.lower():
                    content_index = k + 1
                    pmspiderItem['atc_code']=content_list[content_index][0]
                    pmspiderItem['synonyms']=content_list[content_index][1]
                    pmspiderItem['categories']=content_list[content_index][3]
                    pmspiderItem['dosages']=content_list[content_index][4]
                    pmspiderItem['matchiterm'] = name_list[k]
                    count = count + 1
                    break
            # if count == 0:
            #     for synonyms in synonyms_list[k]:
            #         if synonyms == '':
            #             break
            #         elif synonyms.lower() in pm_page_one.lower():
            #             content_index = k + 1
            #             pmspiderItem['atc_code']=content_list[content_index][0]
            #             pmspiderItem['synonyms']=content_list[content_index][1]
            #             pmspiderItem['categories']=content_list[content_index][3]
            #             pmspiderItem['dosages']=content_list[content_index][4]
            #             pmspiderItem['matchiterm'] = name_list[k]
            #             count = count + 1
            #             break
        if count == 0:
            pmspiderItem['atc_code']= u'NA'
            pmspiderItem['synonyms']= u'NA'
            pmspiderItem['categories']= u'NA'
            pmspiderItem['dosages']= u'NA'
            pmspiderItem['matchiterm'] = u'NA'
        os.remove(temp) 
        return pmspiderItem
 def txtparser(self, response):
     pmspiderItem = pmScrapeItem()
     pdf_file = basename(response.url)
     
     ### clean pm_page_one ###
     pmspiderItem['pm_page_one'] = ' '.join(convert(pdf_file, pages=[0]).split())
     pm_page_one=' '.join(convert(pdf_file, pages=[0]).split()).encode('utf-8').lower()
     pm_page_one = pm_page_one.replace('classification','')
     replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
     pm_page_one = pm_page_one.translate(replace_punctuation)
     pm_page_one = re.sub(' +',' ',pm_page_one.lower())
     ### typo correction ###
     # pm_page_one = pm_page_one.replace('somatropin','somatotropin')
     pm_page_one = pm_page_one.replace('p r o d u c t m o n o g r a p h','')
     pm_page_one = pm_page_one.replace('product monograph','')
     f = open('pmpageone.txt','w')
     f.write(pm_page_one)
     f.close()
     
     pmspiderItem['content'] = ' '.join(convert(pdf_file).split())
     content = ' '.join(convert(pdf_file).split())
     # Write and close the extracted text before Tika reads it for language detection.
     with open('/home/hjiang/pmscrapy/pdf_text/pdftext.txt', 'w') as f:
         f.write(content)
     lang = language.from_file('/home/hjiang/pmscrapy/pdf_text/pdftext.txt')
     pmspiderItem['language'] = lang
     temp = (response.url).split('file://')[1]
     fp = open(temp)
     parser = PDFParser(fp)
     doc = PDFDocument(parser)
     parser.set_document(doc)
     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
     device = PDFPageAggregator(rsrcmgr, laparams=laparams)
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     pages = layout_scanner.get_pages(response.url)
     
     pmspiderItem['file_type']='PDF'
     pmspiderItem['pm_number']=splitext(basename(response.url))[0].decode('utf-8')
     pm_number = splitext(basename(response.url))[0].decode('utf-8')
     pmspiderItem['file_path']='https://pdf.hres.ca/dpd_pm/%s.PDF'%pm_number
     file_path='https://pdf.hres.ca/dpd_pm/%s.PDF'%pm_number
     pmspiderItem['file_name']=basename(file_path).decode('utf-8')
     pmspiderItem['date_scraped']=datetime.datetime.now()
     pmspiderItem['server']=socket.gethostname()
     pmspiderItem['project']=self.settings.get('BOT_NAME')
     pmspiderItem['spider']=self.name
     pmspiderItem['content_length']=len(content)
   
     
     # pt_term_index = []
     # pt_term_index=findItem(ptpm_list,pm_number)
     # if pt_term_index == []:
     #     pmspiderItem['pt_term'] = u'NA'
     #     pmspiderItem['pt_term_index'] = u'NA'
     # else:
     #     pmspiderItem['pt_term'] = ptpm_list[pt_term_index[0][0]][1].decode("utf-8")
     #     pmspiderItem['pt_term_index'] = str(pt_term_index[0][0]).decode("utf-8")
     
     count = 0
     for k in range(len(name_list)):
         if count >= 1:
             break
         text = name_list[k].translate(replace_punctuation)
         ele_list = text.split(' ')
         if len(ele_list) <= 4:
             # Join each permutation back into a single candidate phrase.
             ele_list = [' '.join(p) for p in itertools.permutations(ele_list)]
         else:
             ele_list = [text]
         for i in range(len(ele_list)):
             if ele_list[i].lower() in pm_page_one.lower():
                 content_index = k + 1
                 pmspiderItem['atc_code']=content_list[content_index][0]
                 pmspiderItem['synonyms']=content_list[content_index][1]
                 pmspiderItem['categories']=content_list[content_index][3]
                 pmspiderItem['dosages']=content_list[content_index][4]
                 pmspiderItem['matchiterm'] = name_list[k]
                 count = count + 1
                 print('yes')
                 break
         # if count == 0:
         #     if synonyms_list[k] == '':
         #         print('empty list')
         #         break
         #     else:
         #         for synonyms in synonyms_list[k]:
         #             if synonyms == '':
         #                 print('missing value')
         #                 break
         #             if synonyms.lower() in pm_page_one.lower():
         #                 print("This is synonyms blablabla:%s"%synonyms)
         #                 content_index = k + 1
         #                 pmspiderItem['atc_code']=content_list[content_index][0]
         #                 pmspiderItem['synonyms']=content_list[content_index][1]
         #                 pmspiderItem['categories']=content_list[content_index][3]
         #                 pmspiderItem['dosages']=content_list[content_index][4]
         #                 pmspiderItem['matchiterm'] = synonyms
         #                 count = count + 1
         #                 print('yes1')
         #                 break
     if count == 0:
         pmspiderItem['atc_code']= u'NA'
         pmspiderItem['synonyms']= u'NA'
         pmspiderItem['categories']= u'NA'
         pmspiderItem['dosages']= u'NA'
         pmspiderItem['matchiterm'] = u'NA'
         print('no')
     os.remove(temp) 
     return pmspiderItem
import os
import json
import random
from collections import OrderedDict
from tika import parser
from tika import language

path = "/Users/charanshampur/newAwsDump/testFiles4"
d3LanguageDist = open("D3Language.json", "w")
langFile = open("Language.json", "r")
langDictionary = json.load(langFile)
langFile.close()
Language={}
for path,dirs,files in os.walk(path):
    for file in files:
        if file not in ".DS_Store":
            path_to_file = path+"/"+str(file)
            print path_to_file
            lang = language.from_file(path_to_file)
            if lang not in Language:
                Language[lang]=1
            else:
                Language[lang]+=1

contentList=[]
for k,v in Language.items():
    content=OrderedDict()
    content["label"] = langDictionary[k]["name"]
    content["value"] = int(v)
    content["color"] = "#%06x" % random.randint(0, 0xFFFFFF)
    contentList.append(OrderedDict(content))


json.dump(contentList, d3LanguageDist, indent=4)
d3LanguageDist.close()
Example #12
def test_language():
    """test_language"""
    from tika import language
    print_stars()
    print(language.from_file(doc))
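
test_language() above detects the language of a file on disk. When the text is already in memory, tika-python also provides language.from_buffer; a minimal sketch (the sample string is illustrative, and a running Tika server is assumed):

from tika import language

def detect_language_of_text(text):
    # Send the raw string to the Tika server instead of a file path.
    return language.from_buffer(text)

print(detect_language_of_text("Ceci est un petit exemple de texte."))  # expected: 'fr'
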
Example #13
print("--------------Metadados e o Conteudo dos arquivos--------------")
for i in range(0, 5):
    print("--------------------------------------------------------------")
    print("Nome arquivo: " + arquivos[i])
    parsed = parser.from_file(path_pasta_arquivos +
                              arquivos[i])  #Faz um parse do arquivo
    metadata = parsed["metadata"]
    print(json.dumps(metadata, indent=4))  #Imprime em um formato melhor

    print(parsed["content"])  #Imprime o conteudo do arquivo
    print("--------------------------------------------------------------")
    print("\n\n\n")

print("--------------Idioma do arquivo--------------")
print("O idioma do texto eh: ", language.from_file(path_pasta_arquivos + arq7),
      '\n\n')  #Detecta o idioma do arquivo

print("--------------Traducao arquivo--------------")
print(translate.from_file(path_pasta_arquivos + arq7, 'en',
                          'es'))  #Faz uma traducao do idioma de origem

print("\n\n")

print("--------------Classificacao dos arquivos--------------"
      )  #Tipos dos arquivos MIME
for arquivo in arquivos:
    print("Nome arquivo: %s \tTipo: %s" %
          (arquivo, detector.from_file(path_pasta_arquivos + arquivo)))

print("\n\n")
Example #14
                last_read_file.write(filename)

            # Update metrics and save to disk
            metrics_dict['total'] += 1
            metrics_dict['failed'] += 1
            with open(metrics_dict_file_path, 'wb') as picklefile:
                pickle.dump(metrics_dict, picklefile)

            continue

        # Write the extracted text to the destination
        with io.open(extracted_file_path, "w", encoding="utf-8") as f:
            f.write(parsedFile['content'])

        # Invoke tika to detect language of the text file
        lang = language.from_file(extracted_file_path)

        # Remove non-English text files
        if lang != "en":
            os.remove(extracted_file_path)
            os.remove(download_file_path)
            print("Non-English text file removed.\n")

            # Update metrics and save to disk
            metrics_dict['total'] += 1
            metrics_dict['nonEng'] += 1
            with open(metrics_dict_file_path, 'wb') as picklefile:
                pickle.dump(metrics_dict, picklefile)

        else:
            # Serialize metadata and dump in pickle file
def get_file_lang(file_path):
    result = language.from_file(file_path)
    lang = lang_keys["name."+result]
    return lang
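
get_file_lang() above resolves the two-letter code returned by language.from_file through a lang_keys table defined elsewhere in that project. A hypothetical minimal version of such a table and lookup (the entries and the fallback are illustrative, not the original data):

from tika import language

# Hypothetical lookup table keyed the same way as lang_keys in the snippet above.
lang_keys = {
    "name.en": "English",
    "name.fr": "French",
    "name.de": "German",
}

def get_file_lang(file_path):
    result = language.from_file(file_path)          # e.g. 'en'
    return lang_keys.get("name." + result, result)  # fall back to the raw code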