def detect_files(output_name):
    index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    output_dir = os.path.dirname(output_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_name, 'w') as output_file:
        file_list = yaoner.read_index_file(index_file, base_directory)
        # file_type_map = dict()
        result_list = list()
        for idx, val in enumerate(file_list):
            file_name = os.path.basename(val)
            file_type = detector.from_file(''.join([base_directory, val]))
            # file_type_map[file_name] = file_type
            if file_type is not None:
                result_list.append(file_name)
                result_list.append(' ')
                result_list.append(file_type)
                result_list.append('\n')
        output_file.write(''.join(result_list))
    return
def generateShortDirectory(directory_path, type):
    urlMapping = {}
    URL_Mapping = {}
    count = 0
    for subdir, dirs, files in os.walk(directory_path):
        for file in files:
            if type == 'UUID1':
                shortURL = urlShortnerUUID1()
            elif type == 'UUID4':
                shortURL = urlShortnerUUID4()
            elif type == 'HASH':
                shortURL = hashID(file)
            count += 1
            if count % 100 == 0:
                print(count)
            # if shortURL in urlMapping:
            #     sleep(0.1)
            #     # Again attempting to get another URL
            #     shortURL = urlShortner()
            #     if shortURL in urlMapping:
            #         print 'Again colliding'
            urlMapping[shortURL] = file
            obj = {
                'content-type': detector.from_file(os.path.join(subdir, file)),
                'short_url': shortURL
            }
            URL_Mapping[file] = obj
    print('Counts')
    print(count)
    print(len(urlMapping))
    return URL_Mapping
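# NOTE: `urlShortnerUUID1`, `urlShortnerUUID4`, and `hashID` are not defined in the
# snippet above. A minimal sketch of what they might look like, assuming the short
# URL is simply a truncated UUID or hash digest (hypothetical helpers):
import hashlib
import uuid

def urlShortnerUUID1():
    # first 8 hex characters of a time-based UUID
    return uuid.uuid1().hex[:8]

def urlShortnerUUID4():
    # first 8 hex characters of a random UUID
    return uuid.uuid4().hex[:8]

def hashID(name):
    # first 8 hex characters of the file name's SHA-1 digest
    return hashlib.sha1(name.encode('utf-8')).hexdigest()[:8]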
def detect_and_move(filelist, mapping):
    for filename in filelist:
        detected_mime = detector.from_file(filename)
        # if detected_mime in config.keys():
        mapping[detected_mime].append(filename)
    return
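# The commented-out check suggests `mapping` is pre-populated from a config; a
# defaultdict avoids a KeyError the first time a MIME type is seen. Hypothetical
# usage (file names invented):
from collections import defaultdict

mapping = defaultdict(list)
detect_and_move(['report.pdf', 'notes.txt'], mapping)
# e.g. {'application/pdf': ['report.pdf'], 'text/plain': ['notes.txt']}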
def computeMIME(self):
    """
    Compute the MIME type of all the files in the Solr core.
    :return: the list of MimeType objects
    """
    ContentTypeList = []

    # Get the files whose metadata will be computed
    response = MIME_Core().queryAll()
    files = response.result.dict['response']['docs']

    fileIndex = 0
    totalFiles = len(files)
    print('Adding metadata to the dataset')
    utility.printProgress(fileIndex, totalFiles, prefix='Progress:', suffix='Complete', barLength=50)

    # Loop over all the files
    for file in files:
        # Compute the mime type
        contentType = str(detector.from_file(file['file'][0]))
        file['metadata'] = contentType

        # Append to the list
        ContentTypeList.append(file)
        fileIndex += 1
        utility.printProgress(fileIndex, totalFiles, prefix='Progress:', suffix='Complete', barLength=50)

    # Return the list
    return ContentTypeList
def __init__(self, filepath):
    """
    Create a new FileElement.

    :param filepath: Path to the file to wrap. If relative, it is
        interpreted as relative to the current working directory.
    :type filepath: str
    """
    super(DataFileElement, self).__init__()

    # Just expand a user-home `~` if present; keep the path relative if given.
    # The expanded path is used consistently below so that `~`-prefixed inputs
    # are detected correctly.
    self._filepath = osp.expanduser(filepath)

    self._content_type = None
    if magic and osp.isfile(self._filepath):
        r = magic.detect_from_filename(self._filepath)
        self._content_type = r.mime_type
    elif tika_detector:
        try:
            self._content_type = tika_detector.from_file(self._filepath)
        except IOError as ex:
            self._log.warn(
                "Failed tika.detector.from_file content type "
                "detection (error: %s), falling back to file "
                "extension", str(ex))
    # If there is no tika detector, or it failed for some reason
    if not self._content_type:
        self._content_type = mimetypes.guess_type(filepath)[0]
def getMimeType(self, filename):
    """
    Get the MIME type of the file.

    :param filename: path to the file to inspect
    :return: the detected MIME type as a string
    """
    mime_type = detector.from_file(filename)
    return str(mime_type)
def detect_threaded(threadid, filelist, mapping, start_index, end_index):
    print("Start index = {start}, end index = {end}".format(start=start_index, end=end_index))
    # Iterate over the slice [start_index:end_index]; the original
    # `(filelist[start_index], filelist[end_index])` visited only the two
    # endpoint elements.
    for filename in filelist[start_index:end_index]:
        print("Thread {threadid} processing {file}".format(threadid=threadid, file=filename))
        detected_mime = detector.from_file(filename)
        # if detected_mime in config.keys():
        # A shared lock must be used here: `threading.Lock()` creates a fresh
        # lock on every call, which never actually serializes access. This
        # assumes a module-level `mapping_lock` (see the sketch below).
        with mapping_lock:
            mapping[detected_mime].append(filename)
    return
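# A minimal, hypothetical driver for detect_threaded, defining the shared
# module-level `mapping_lock` assumed above and splitting the file list into
# one contiguous slice per thread:
import threading
from collections import defaultdict

mapping_lock = threading.Lock()  # shared by all worker threads

def run_detection(filelist, num_threads=4):
    mapping = defaultdict(list)
    chunk = max(1, len(filelist) // num_threads)
    threads = []
    for i in range(num_threads):
        start = i * chunk
        end = len(filelist) if i == num_threads - 1 else (i + 1) * chunk
        t = threading.Thread(target=detect_threaded,
                             args=(i, filelist, mapping, start, end))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    return mapping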
def fetchFile(path, END_POINT):
    fileName = "{0}-{1}".format(ntpath.basename(path), time.time())

    # Copy the file into the endpoint directory
    tmpPath = join(END_POINT, fileName)
    copyfile(path, tmpPath)

    # Run Tika and file the copy under a per-MIME-type directory
    tp = detector.from_file(path).replace("/", "-")
    d = join(END_POINT, tp)
    safeMakeDir(d)
    saveFile(d, fileName, tmpPath)
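# `safeMakeDir` and `saveFile` are not shown in the snippet. A plausible
# implementation of the former, assuming it just creates the directory if
# it is missing:
import os

def safeMakeDir(d):
    # create the directory (and any parents) if it does not already exist
    os.makedirs(d, exist_ok=True)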
def build_media_types():
    mt = defaultdict(list)
    for fmt in FORMATS:
        outf = "demo.{}".format(fmt)
        print("Processing {}:".format(outf), end=" ")
        if not exists(outf):
            if fmt in ("3gp", "3g2"):
                cmd = [
                    ffmpeg.exe_path(),
                    "-i", "master.3gp",
                    "-f", fmt,
                    "-vcodec", "h263",
                    "-vf", "scale=352x288",
                    "-acodec", "amr_nb",
                    "-ar", "8000",
                    "-ac", "1",
                    outf,
                ]
            else:
                cmd = [ffmpeg.exe_path(), "-i", "master.3gp", "-loglevel", "2", outf]
            subprocess.run(cmd)
        media_type = detector.from_file(abspath(outf))
        sigs = video_id.get_frame_vectors(abspath(outf))
        vid = video_id.content_id_video(sigs)
        # os.remove(outf)
        print("{} -> {} -> {}".format(vid, outf, media_type))
        mt[media_type].append(fmt)
    for m, e in mt.items():
        if len(e) == 1:
            print(f'"{m}": {{"gmt": GMT.VIDEO, "ext": "{e[0]}"}},')
        else:
            print(f'"{m}": {{"gmt": GMT.VIDEO, "ext": {e}}},')
def __init__(self, filepath):
    """
    Create a new FileElement.

    :param filepath: Path to the file to wrap. If relative, it is
        interpreted as relative to the current working directory.
    :type filepath: str
    """
    super(DataFileElement, self).__init__()

    self._filepath = osp.abspath(osp.expanduser(filepath))

    self._content_type = None
    if tika_detector:
        try:
            self._content_type = tika_detector.from_file(filepath)
        except IOError:
            pass
    # If there is no tika detector, or it failed for some reason
    if not self._content_type:
        self._content_type = mimetypes.guess_type(filepath)[0]
def __init__(self, filepath):
    """
    Create a new FileElement.

    :param filepath: Path to the file to wrap. If relative, it is
        interpreted as relative to the current working directory.
    :type filepath: str
    """
    super(DataFileElement, self).__init__()

    self._filepath = osp.abspath(osp.expanduser(filepath))

    self._content_type = None
    if tika_detector:
        try:
            self._content_type = tika_detector.from_file(filepath)
        except IOError as ex:
            self._log.warn(
                "Failed tika.detector.from_file content type "
                "detection (error: %s), falling back to file "
                "extension", str(ex))
def detect_content_type(filename_or_url: str) -> str:
    """
    Use tika to get the content type of a file or URL.
    TODO: there may be faster/better ways
    """
    content_type = None
    try:
        if path.isfile(filename_or_url):
            content_type = detector.from_file(filename_or_url)
        else:
            buffer = requests.get(filename_or_url).content
            content_type = detector.from_buffer(BytesIO(buffer))
        log.info(f"Detected '{content_type}' as content type for: {filename_or_url}")
    except Exception as e:
        msg = f"Error detecting content type of '{filename_or_url}': {str(e)}"
        log.error(msg)
        raise Exception(msg)
    assert content_type
    return content_type
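# Usage sketch (the file name and URL below are hypothetical):
#   detect_content_type('report.pdf')            -> e.g. 'application/pdf'
#   detect_content_type('https://example.com/')  -> e.g. 'text/html'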
def file2bib(carrel, file, metadata=None):

    # configure
    BIB = 'bib'
    TXT = 'txt'
    CACHE = 'cache'
    COUNT = 24
    EXTENSION = '.txt'
    BIBEXTENSION = '.bib'
    HEADER = ['id', 'author', 'title', 'date', 'pages', 'extension',
              'mime', 'words', 'sentence', 'flesch', 'summary', 'cache', 'txt']
    PROCESS = 'textrank'

    # require
    from pathlib import Path
    from textacy import text_stats
    from tika import detector
    from tika import parser
    import os
    import spacy
    import pytextrank

    # initialize
    authorFound = False
    dateFound = False
    titleFound = False
    title = name2key(file)
    extension = os.path.splitext(os.path.basename(file))[1]
    key = name2key(file)
    pages = ''
    summary = ''
    localLibrary = configuration('localLibrary')

    # debug
    if VERBOSE:
        click.echo(('\t%s' % key), err=True)

    # get the text, and if not, then return; the whole point is to have content to read!
    parsed = parser.from_file(file)
    text = parsed['content']
    if not text:
        return

    # get metadata from the metadata file
    if str(type(metadata)) == "<class 'pandas.core.frame.DataFrame'>":

        # parse
        index = Path(file).name

        # check to see if the index value exists
        if index in metadata.index:
            if 'author' in metadata:
                author = str(metadata.loc[index]['author'])
                authorFound = True
            if 'title' in metadata:
                title = metadata.loc[index]['title']
                titleFound = True
            if 'date' in metadata:
                date = str(metadata.loc[index]['date'])
                dateFound = True

    # get metadata from the source file
    metadata = parsed['metadata']
    mimetype = detector.from_file(file)

    # author
    if authorFound == False:
        if 'creator' in metadata:
            author = metadata['creator']
            if isinstance(author, list):
                author = author[0]
        else:
            author = ''

    # title
    if titleFound == False:
        if 'title' in metadata:
            title = metadata['title']
            if isinstance(title, list):
                title = title[0]
            title = ' '.join(title.split())

    # date
    if dateFound == False:
        if 'date' in metadata:
            date = metadata['date']
            if isinstance(date, list):
                date = date[0]
            date = date[:date.find('T')]
        else:
            date = ''

    # number of pages
    if 'xmpTPg:NPages' in metadata:
        pages = metadata['xmpTPg:NPages']
        if isinstance(pages, list):
            pages = pages[0]

    # model the text
    nlp = spacy.load(MODEL)
    nlp.max_length = (len(text) + 1)
    nlp.add_pipe(PROCESS)
    doc = nlp(text)

    # summarize
    summary = summarize(doc)

    # parse out only the desired statistics
    words = text_stats.n_words(doc)
    sentences = text_stats.n_sents(doc)
    syllables = text_stats.n_syllables(doc)
    flesch = int(text_stats.readability.flesch_reading_ease(doc))

    # cache and text locations
    txt = Path(TXT) / (key + EXTENSION)
    cache = Path(CACHE) / (key + extension)

    # debug
    if VERBOSE == 2:

        # provide a review
        click.echo('        key: ' + key, err=True)
        click.echo('     author: ' + author, err=True)
        click.echo('      title: ' + title, err=True)
        click.echo('       date: ' + date, err=True)
        click.echo('  extension: ' + extension, err=True)
        click.echo('      pages: ' + pages, err=True)
        click.echo('  mime-type: ' + mimetype, err=True)
        click.echo('    summary: ' + summary, err=True)
        click.echo('      words: ' + str(words), err=True)
        click.echo('  sentences: ' + str(sentences), err=True)
        click.echo('     flesch: ' + str(flesch), err=True)
        click.echo('      cache: ' + str(cache), err=True)
        click.echo('        txt: ' + str(txt), err=True)
        click.echo('', err=True)

    # open output
    output = localLibrary / carrel / BIB / (key + BIBEXTENSION)
    with open(output, 'w', encoding='utf-8') as handle:
        try:
            # output the header and the data
            handle.write('\t'.join(HEADER) + '\n')
            handle.write('\t'.join([key, author, title, str(date), pages,
                                    extension, mimetype, str(words),
                                    str(sentences), str(flesch), summary,
                                    str(cache), str(txt)]) + '\n')

        # trap weird TypeError
        except TypeError:
            click.echo(("\nWARNING (TypeError): Probably weird author value "
                        "extracted from PDF file (key: %s). Call Eric.\n" % key), err=True)

    # check for text, and it should exist; famous last words
    if text:

        # configure output and output
        output = localLibrary / carrel / TXT / (key + EXTENSION)
        with open(output, 'w', encoding='utf-8') as handle:
            handle.write(text)
def test_detect_pdf(self):
    resp = from_file('tika/tests/arguments/Newton.pdf')
    self.assertEqual(resp, 'application/pdf')
def determineFiletype(filepath):
    mimetype = detector.from_file(filepath)
    print(mimetype)
    if mimetype in GEOPHYSICS_MIMETYPES:
        return mimetype, 'Geophysics'
    if mimetype in GEOCHEMESTRY_MIMETYPES:
        return mimetype, 'Geochemistry'
    return mimetype, 'unknown'
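# GEOPHYSICS_MIMETYPES and GEOCHEMESTRY_MIMETYPES are assumed to be module-level
# constants; hypothetical values sketching their shape (the real lists depend on
# the dataset being sorted):
GEOPHYSICS_MIMETYPES = ['application/x-netcdf', 'application/grib']
GEOCHEMESTRY_MIMETYPES = ['text/csv', 'application/vnd.ms-excel']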
if __name__ == '__main__':
    chunk = 1
    pp = preprocess()
    pp_nobeta = preprocess(1)
    temp_path = ''
    try:  # if True:
        cnt = 0
        for root, dirs, files in os.walk(pp.path):
            for name in files:
                temp_path = os.path.join(root, name)
                if os.path.isfile(temp_path) and os.path.getsize(temp_path) > 0:  # and temp_path[-4:] != 'json' and temp_path[-2:] != 'py' and temp_path[-2:] != 'sh' and temp_path[-3:] != 'txt' and temp_path[-5:] != 'Store' and temp_path[-3:] != 'pyc' and temp_path[-3:] != 'csv':
                    filetype = detector.from_file(temp_path)
                    # print filetype

                    table = pp.computeOnlyFingerPrint(temp_path)
                    table.insert(0, filetype)
                    pp.output.append(table)

                    table_nobeta = pp_nobeta.computeOnlyFingerPrint(temp_path)
                    table_nobeta.insert(0, filetype)
                    pp_nobeta.output.append(table_nobeta)

                    cnt += 1
                    if cnt % 100 == 0:
                        print(cnt)
                    if cnt > 0 and cnt % 10000 == 0:
                        df = pd.DataFrame(pp.output)
                        df.to_csv('temp_3p_data.csv', sep=',', index=False)
parsedData="" path_to_file = path+"/"+str(file) print path_to_file urltext,typeofrequest,requesthostname,responsehostname,responseheader,responsestatus = getdatafromdoc(path_to_file) data = {} data["url"]=urltext data["typeofrequest"]=typeofrequest data["requesthostname"]=requesthostname data["responsehostname"]=responsehostname data["responsestatus"]=responsestatus data["responseheader"]=responseheader #Get NER data docType = detector.from_file(path_to_file) if docType in tagRatioFileTypes: buffer = handleHtml(path_to_file,docType) else: try: buffer=subprocess.check_output(['java', '-jar', tikaSnapshotPath, '-t', path_to_file]) except: errorFile.write(path_to_file+"\n") continue if (buffer==None): errorFile.write(path_to_file+"\n") continue if (len(buffer)==0): errorFile.write(path_to_file+"\n") #continue
def test_detect_doc(self):
    resp = from_file('tika/tests/arguments/Newton.doc')
    self.assertEqual(resp, 'application/msword')
reload(sys)  # so that we don't get a unicode encode error
sys.setdefaultencoding('utf-8')

path = "C:\\Users\\HP\\Desktop\\test_data\\"
directory = os.listdir(path)
start = time.time()
print(directory)
count = 0
type_dict = dict()
for fpath in directory:
    # parse the file to extract metadata and content
    file_path = path + fpath
    parsed = parser.from_file(file_path)

    # detect the file type
    file_type = detector.from_file(file_path)
    print(file_type)
    if file_type in type_dict:
        type_dict[file_type].append(directory[count])
    else:
        type_dict[file_type] = [directory[count]]

    parsed['id'] = str(count)

    # convert the output from a Python dict to XML
    xml = dicttoxml(parsed)
    dic = {'id': str(count), "content": parsed['content']}
    # dic.update(parsed['metadata'])
    solr.add([{
        "id": str(count),
        "content": parsed['content'],
        "payloads": parsed['metadata']
# Function uses a regular expression to identify measurements present in text.
"""
def getMeasure(buffer):
    measurements = re.findall(r'\d+[a-zA-Z]+|\d+\s{1,3}[a-zA-Z]+|\d+\.\d+[a-zA-Z]+|\d+\.\d+\s[a-zA-Z]+', buffer)
    return measurements
"""

# Main processing.
measurementJson = {}
for path, dirs, files in os.walk(path):
    for file in files:
        # `!=` rather than the original `not in`, which was a substring test
        # that also skipped files named e.g. "S" or "DS"
        if file != ".DS_Store":
            parsedData = ""
            path_to_file = path + "/" + str(file)
            print(path_to_file)
            docType = detector.from_file(path_to_file)
            if docType in tagRatioFileTypes:
                buffer = handleHtml(path_to_file, docType)
            else:
                try:
                    buffer = subprocess.check_output(
                        ['java', '-jar', tikaSnapshotPath, '-t', path_to_file])
                except:
                    errorFile.write(path_to_file + "\n")
                    continue
            if buffer is None:
                errorFile.write(path_to_file + "\n")
                continue
            if len(buffer) == 0:
                errorFile.write(path_to_file + "\n")
                continue
def test_detect():
    """test_detect"""
    from tika import detector
    print_stars()
    print(detector.from_file(doc))
#!/usr/bin/env python
import os

import tika
#tika.initVM()
from tika import parser
from tika import detector
#tika.TikaClientOnly = True

#parsed = parser.from_file('parser.java')
print(detector.from_file('../examples/eml-200/00_eml-200.xml'))

home = os.getenv('HOME')
tika.tika.TikaServerClasspath = home + '/git/geotopicparser-utils/mime:' + home + '/git/geotopicparser-utils/models/polar'
print(detector.from_file('https://github.com/chrismattmann/geotopicparser-utils/tree/master/geotopics/polar.geot'))
#!/usr/bin/env python
import json
import os
import sys

from SolrClient import SolrClient
from tika import detector

solr = SolrClient('http://localhost:8983/solr')
walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1]))
walk_i = 0
ratios = {}
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        path = root + '/' + file
        file_size = os.stat(path).st_size
        if file_size == 0:
            continue
        mime = detector.from_file(path)
        total, n = ratios.get(mime, (0, 0))  # renamed from `sum`, which shadowed the builtin
        docs = solr.query('collection1', {'q': 'id:' + file}).data['response']['docs']
        ratios[mime] = total + len(json.dumps(docs)) / file_size, n + 1
        walk_i += 1
        print(str(walk_i * 100 // walk_n) + '%', end='\r')
with open('size-diversity.json', 'w') as f:
    json.dump({mime: total / n for mime, (total, n) in ratios.items()}, f)
print("--------------------------------------------------------------") print("\n\n\n") print("--------------Idioma do arquivo--------------") print("O idioma do texto eh: ", language.from_file(path_pasta_arquivos + arq7), '\n\n') #Detecta o idioma do arquivo print("--------------Traducao arquivo--------------") print(translate.from_file(path_pasta_arquivos + arq7, 'en', 'es')) #Faz uma traducao do idioma de origem print("\n\n") print("--------------Classificacao dos arquivos--------------" ) #Tipos dos arquivos MIME for arquivo in arquivos: print("Nome arquivo: %s \tTipo: %s" % (arquivo, detector.from_file(path_pasta_arquivos + arquivo))) print("\n\n") print("--------------Metadados Audio--------------") parsed = parser.from_file(path_pasta_arquivos + arq4) #Faz um parse do arquivo metadata = parsed["metadata"] print(json.dumps(metadata, indent=4)) #Imprime em um formato melhor print("\n\n") print("--------------Metadados IMG--------------") parsed = parser.from_file(path_pasta_arquivos + arq5) #Faz um parse do arquivo metadata = parsed["metadata"] print(json.dumps(metadata, indent=4)) #Imprime em um formato melhor
def start():
    form = ReusableForm(request.form)
    #solr = pysolr.Solr('http://localhost:8983/solr/', timeout=10)

    reload(sys)  # so that we don't get a unicode encode error
    sys.setdefaultencoding('utf-8')

    #path = "C:\\Users\\HP\\Desktop\\test_data\\"
    global path
    path = request.form['folder']
    directory = os.listdir(path)
    start = time.time()
    print(directory)
    count = 0
    type_dict = dict()
    for fpath in directory:
        # parse the file to extract metadata and content
        file_path = path + fpath
        parsed = parser.from_file(file_path)

        # detect the file type
        file_type = detector.from_file(file_path)
        print(file_type)
        if file_type in type_dict:
            type_dict[file_type].append(directory[count])
        else:
            type_dict[file_type] = [directory[count]]

        parsed['id'] = str(count)

        # convert the output from a Python dict to XML
        #xml = dicttoxml(parsed)
        #dic = {'id': str(count), "content": parsed['content']}
        #dic.update(parsed['metadata'])

        if parsed['content'] is None:
            parsed['content'] = "crime"

        solr.add([{
            "id": str(count),
            "content": parsed['content'],
            "payloads": parsed['metadata']
        }])

        count = count + 1

        # create, write (xml), and close the file
        #dom = parseString(xml)
        #pretty_output = dom.toprettyxml()
        #f = open(str(count) + 'webpages', 'w+')
        #f.write(pretty_output)
        #f.close()

    # time elapsed: average between 0.5 and 0.7 seconds
    # specs: 3.8 GB RAM, i5-4200U, 1.6 GHz x 4
    end = time.time()
    t_time = end - start
    print(t_time)

    #result = solr.search("pegasus")
    #for r in result:
    #    id = r['id']
    #    print directory[int(r['id'])]
    #    print path + directory[int(r['id'])]

    print(type_dict)
    return render_template('form2.html', form=form, type=type_dict)
import os
from shutil import copyfile

import tika
from tika import detector

# Set the directory you want to start from
rootDir = '/media/jaydeep/mySpace/spring2016/599/polarfulldump'
for dirName, subdirList, fileList in os.walk(rootDir):
    print('Found directory: %s' % dirName)
    for fname in fileList:
        filetype = detector.from_file(dirName + '/' + fname)
        if filetype == 'video/mp4':
            copyfile(dirName + '/' + fname, '/media/jaydeep/mySpace/spring2016/599/polarmp4/' + fname)
            # parenthesized: `%` binds tighter than `+`, so the original
            # formatted only dirName and then appended the rest
            print('\t%s' % (dirName + '/' + fname))
#!/usr/bin/env python
import tika
#tika.initVM()
from tika import parser
from tika import detector

tika.TikaClientOnly = True

#parsed = parser.from_file('parser.java')
print(detector.from_file('parser.java'))

"""
parsed = parser.from_buffer('Good evening, Dave', 'http://tika:9998/tika')
print(parsed["metadata"])
print(parsed["content"])
"""
def detect_ext(self):
    raw = detector.from_file(self.file)
    return mime_to_ext[raw]
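# `mime_to_ext` is not defined in the snippet above; a hypothetical sketch of
# the mapping it assumes, from detected MIME type to file extension:
mime_to_ext = {
    'application/pdf': '.pdf',
    'text/plain': '.txt',
    'image/jpeg': '.jpg',
}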
import os
import shutil
import sys
import traceback

from tika import detector

rootdir = sys.argv[1]  # input dir
outdir = sys.argv[2]   # output dir

# dirwalk
for root, subdirs, files in os.walk(rootdir):
    for file in files:
        try:
            if os.path.isdir(os.path.join(root, file)):
                continue

            # detect the mime type using Tika
            detect = detector.from_file(os.path.join(root, file))
            result = detect.replace("/", "_")

            # create a directory in the output folder with the mime type name
            dirname = os.path.join(outdir, result)
            if not os.path.exists(dirname):
                os.makedirs(dirname)

            # move the file to the directory
            shutil.move(os.path.join(root, file), os.path.join(dirname, file))
            if not os.path.exists(os.path.join(dirname, file)):
                print("Something went wrong!")
        except OSError as exc:
exit()

# initialize
author = ''
file = sys.argv[1]
title = os.path.splitext(os.path.basename(file))[0]
extension = os.path.splitext(os.path.basename(file))[1]
id = title
date = ''
pages = ''
txt = TXT + '/' + id + '.txt'
cache = CACHE + '/' + id + extension
engine = create_engine('sqlite:///' + DATABASE)

# extract mime-type, just in case
mimetype = detector.from_file(file)

# extract metadata
parsed = parser.from_file(file)
metadata = parsed["metadata"]

# get (possible) pre-existing bibliographic values
escape = id.replace("'", "''")
query = 'SELECT id, author, title, date FROM bib WHERE id IS "{}"'.format(escape)
bibliographics = pd.read_sql_query(query, engine, index_col='id')

# parse author
if bibliographics.loc[escape, 'author']:
    author = bibliographics.loc[escape, 'author']
else:
    if 'creator' in metadata:
        author = metadata['creator']
# specify a list of directories as input
rootdirs = [os.path.normpath("E:/polardata_chosen/"),
            os.path.normpath("E:/polardata_octet/"),
            os.path.normpath("E:/polardata_sorted/")]

mimetypes = {}
for rootdir in rootdirs:
    # dirwalk
    for root, subdirs, files in os.walk(rootdir):
        for file in files:
            try:
                if os.path.isdir(os.path.join(root, file)):
                    continue

                # use tika to detect the mime type
                detect = detector.from_file(os.path.join(root, file))

                # build the dictionary of mime types and the count of files of each type
                if detect in mimetypes:
                    mimetypes[detect] += 1
                else:
                    mimetypes[detect] = 1
            except OSError as exc:
                print(exc.strerror)
                continue
            except Exception as err:
                print(traceback.format_exc())
                continue

# print the mime type counts to a json file