Example #1
def detect_files(output_name):
    index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'

    output_dir = os.path.dirname(output_name)
    if output_dir and not os.path.exists(output_dir):  # dirname may be '' for a bare filename
        os.makedirs(output_dir)

    with open(output_name, 'w') as output_file:
        file_list = yaoner.read_index_file(index_file, base_directory)

        # file_type_map = dict()
        result_list = list()
        for idx, val in enumerate(file_list):
            file_name = os.path.basename(val)
            file_type = detector.from_file(''.join([base_directory, val]))
            # file_type_map[file_name] = file_type
            if file_type is not None:
                result_list.append(file_name)
                result_list.append(' ')
                result_list.append(file_type)
                result_list.append('\n')

        output_file.write(''.join(result_list))
    return
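Example #1 depends on yaoner.read_index_file, which is not shown. A minimal sketch, assuming the index file lists one path per line relative to base_directory (the format is a guess, not the original helper):

def read_index_file(index_file, base_directory):
    # Assumed format: one relative path per line; blank lines skipped.
    # base_directory is kept only for signature compatibility with the call above.
    with open(index_file) as handle:
        return [line.strip() for line in handle if line.strip()]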
Example #2
def generateShortDirectory(directory_path, type):

    urlMapping = {}
    URL_Mapping = {}
    count = 0
    for subdir, dirs, files in os.walk(directory_path):
        for file in files:
            if type == 'UUID1':
                shortURL = urlShortnerUUID1()
            elif type == 'UUID4':
                shortURL = urlShortnerUUID4()
            elif type == 'HASH':
                shortURL = hashID(file)
            count += 1

            if count % 100 == 0:
                print(count)
            # if shortURL in urlMapping:
            #     sleep(0.1)
            #     # Again attempting to get another URL
            #     shortURL = urlShortner()
            #     if shortURL in urlMapping:
            #         print 'Again colliding'

            urlMapping[shortURL] = file
            obj = {
                'content-type': detector.from_file(os.path.join(subdir, file)),
                'short_url': shortURL
            }
            URL_Mapping[file] = obj

    print('Counts')
    print(count)
    print(len(urlMapping))
    return URL_Mapping
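Example #2 calls three short-URL helpers it never defines. Plausible minimal versions (the eight-character truncation and the hash choice are assumptions, not the original code):

import hashlib
import uuid

def urlShortnerUUID1():
    return uuid.uuid1().hex[:8]  # time-based UUID, truncated

def urlShortnerUUID4():
    return uuid.uuid4().hex[:8]  # random UUID, truncated

def hashID(name):
    return hashlib.md5(name.encode('utf-8')).hexdigest()[:8]  # deterministic per filename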
Example #3
def detect_and_move(filelist, mapping):
    for filename in filelist:
        detected_mime = detector.from_file(filename)
        #if detected_mime in (config.keys())
        #    mapping[detected_mime].append(filename)
        mapping[detected_mime].append(filename)
    return
Example #4
    def computeMIME(self):
        """
        Computation of the mime type of all the files in the solr core
        :return: the list of MimeType objects
        """

        ContentTypeList = []
        # Getting the files whose meta data would be computed
        response = MIME_Core().queryAll()
        files = response.result.dict['response']['docs']

        fileIndex = 0
        totalFiles = len(files)

        print('Adding metadata to the dataset')
        utility.printProgress(fileIndex, totalFiles, prefix = 'Progress:', suffix = 'Complete', barLength = 50)

        # Looping over all the files
        for file in files:
            # Computing the mime type
            contentType = str(detector.from_file(file['file'][0]))

            file['metadata'] = contentType

            # Appending to the list
            ContentTypeList.append(file)

            fileIndex += 1
            utility.printProgress(fileIndex, totalFiles, prefix = 'Progress:', suffix = 'Complete', barLength = 50)
        # Returning the list
        return ContentTypeList
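utility.printProgress is not shown either; a common console progress-bar recipe with the same signature (an assumption based on the call sites above) looks roughly like:

import sys

def printProgress(iteration, total, prefix='', suffix='', barLength=50):
    # Redraw a one-line text progress bar in place
    filled = barLength * iteration // max(total, 1)
    bar = '#' * filled + '-' * (barLength - filled)
    sys.stdout.write('\r%s |%s| %s' % (prefix, bar, suffix))
    if iteration >= total:
        sys.stdout.write('\n')
    sys.stdout.flush()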
Example #5
    def __init__(self, filepath):
        """
        Create a new FileElement.

        :param filepath: Path to the file to wrap.  If relative, it is
            interpreted as relative to the current working directory.
        :type filepath: str

        """
        super(DataFileElement, self).__init__()

        # Just expand a user-home `~` if present, keep relative if given.
        self._filepath = osp.expanduser(filepath)

        self._content_type = None
        if magic and osp.isfile(filepath):
            r = magic.detect_from_filename(filepath)
            self._content_type = r.mime_type
        elif tika_detector:
            try:
                self._content_type = tika_detector.from_file(filepath)
            except IOError as ex:
                self._log.warn(
                    "Failed tika.detector.from_file content type "
                    "detection (error: %s), falling back to file "
                    "extension", str(ex))
        # If no tika detector or it failed for some reason
        if not self._content_type:
            self._content_type = mimetypes.guess_type(filepath)[0]
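The same detection fallback, condensed into a standalone function for reference (a sketch; the class wiring and the optional python-magic branch above are omitted):

import mimetypes
from tika import detector as tika_detector

def guess_content_type(filepath):
    # Try Tika first, then fall back to extension-based guessing
    try:
        content_type = tika_detector.from_file(filepath)
    except IOError:
        content_type = None
    return content_type or mimetypes.guess_type(filepath)[0]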
Example #6
    def getMimeType(self, filename):
        """
        Get the MIME type of the file.
        :param filename:
        :return:
        """
        mime_type = detector.from_file(filename)

        return str(mime_type)
Example #7
_map_lock = threading.Lock()  # shared lock; the original created a new Lock() per iteration, which synchronizes nothing

def detect_threaded(threadid, filelist, mapping, start_index, end_index):
    print("Start index = {start}, end index = {end}".format(start=start_index, end=end_index))
    # Iterate over the whole slice, not just the two endpoint elements
    for filename in filelist[start_index:end_index]:
        print("Thread {threadid} processing {file}".format(threadid=threadid, file=filename))
        detected_mime = detector.from_file(filename)
        #if detected_mime in (config.keys())
        #    mapping[detected_mime].append(filename)
        with _map_lock:
            mapping[detected_mime].append(filename)
    return
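A hedged usage sketch for detect_threaded: partition the file list into index ranges and hand each range to its own thread (run_detection and the chunking policy are illustrative, not from the source):

import threading
from collections import defaultdict

def run_detection(filelist, num_threads=4):
    mapping = defaultdict(list)
    chunk = max(1, len(filelist) // num_threads)
    threads = []
    for i in range(num_threads):
        start = i * chunk
        end = len(filelist) if i == num_threads - 1 else (i + 1) * chunk
        t = threading.Thread(target=detect_threaded,
                             args=(i, filelist, mapping, start, end))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    return mapping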
Example #8
def fetchFile(path, END_POINT):
  fileName = "{0}-{1}".format(ntpath.basename(path), time.time())

  # Download File
  tmpPath = join(END_POINT, fileName)
  copyfile(path, tmpPath)

  # Run Tika
  tp = detector.from_file(path).replace("/", "-")

  d = join(END_POINT, tp)

  safeMakeDir(d)

  saveFile(d, fileName, tmpPath)
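fetchFile relies on safeMakeDir and saveFile, which the snippet does not define. Plausible one-liners (assumptions, not the original helpers):

import os
import shutil

def safeMakeDir(d):
    os.makedirs(d, exist_ok=True)  # create the directory tree if it is missing

def saveFile(d, fileName, tmpPath):
    shutil.move(tmpPath, os.path.join(d, fileName))  # file the copy under its MIME-named folder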
Example #9
def build_media_types():
    mt = defaultdict(list)
    for fmt in FORMATS:
        outf = "demo.{}".format(fmt)
        print("Processing {}:".format(outf), end=" ")
        if not exists(outf):
            if fmt in ("3gp", "3g2"):
                cmd = [
                    ffmpeg.exe_path(),
                    "-i",
                    "master.3gp",
                    "-f",
                    fmt,
                    "-vcodec",
                    "h263",
                    "-vf",
                    "scale=352x288",
                    "-acodec",
                    "amr_nb",
                    "-ar",
                    "8000",
                    "-ac",
                    "1",
                    outf,
                ]
            else:
                cmd = [
                    ffmpeg.exe_path(), "-i", "master.3gp", "-loglevel", "2",
                    outf
                ]
            subprocess.run(cmd)
        media_type = detector.from_file(abspath(outf))
        sigs = video_id.get_frame_vectors(abspath(outf))
        vid = video_id.content_id_video(sigs)
        # os.remove(outf)
        print("{} -> {} -> {}".format(vid, outf, media_type))
        mt[media_type].append(fmt)
    for m, e in mt.items():
        if len(e) == 1:
            print(f'"{m}": {{"gmt": GMT.VIDEO, "ext": "{e[0]}"}},')
        else:
            print(f'"{m}": {{"gmt": GMT.VIDEO, "ext": {e}}},')
Example #10
    def __init__(self, filepath):
        """
        Create a new FileElement.

        :param filepath: Path to the file to wrap.  If relative, it is
            interpreted as relative to the current working directory.
        :type filepath: str

        """
        super(DataFileElement, self).__init__()

        self._filepath = osp.abspath(osp.expanduser(filepath))

        self._content_type = None
        if tika_detector:
            try:
                self._content_type = tika_detector.from_file(filepath)
            except IOError:
                pass
        # If no tika detector or it failed for some reason
        if not self._content_type:
            self._content_type = mimetypes.guess_type(filepath)[0]
Example #11
    def __init__(self, filepath):
        """
        Create a new FileElement.

        :param filepath: Path to the file to wrap.  If relative, it is
            interpreted as relative to the current working directory.
        :type filepath: str

        """
        super(DataFileElement, self).__init__()

        self._filepath = osp.abspath(osp.expanduser(filepath))

        self._content_type = None
        if tika_detector:
            try:
                self._content_type = tika_detector.from_file(filepath)
            except IOError as ex:
                self._log.warn(
                    "Failed tika.detector.from_file content type "
                    "detection (error: %s), falling back to file "
                    "extension", str(ex))
Example #14
def detect_content_type(filename_or_url: str) -> str:
    """
    Use tika to get the content type of a file or url.
    TODO there may be faster/ better ways
    """
    content_type = None
    try:
        if path.isfile(filename_or_url):

            content_type = detector.from_file(filename_or_url)
        else:
            buffer = requests.get(filename_or_url).content
            content_type = detector.from_buffer(BytesIO(buffer))

        log.info(f"Detected '{content_type}' as content type for: {filename_or_url}")

    except Exception as e:
        msg = f"Error detecting content type of '{filename_or_url}' : {str(e)}"
        log.error(msg)
        raise Exception(msg)

    assert content_type

    return content_type
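A quick usage sketch for detect_content_type (the path and URL are placeholders):

print(detect_content_type('report.pdf'))            # e.g. 'application/pdf'
print(detect_content_type('https://example.com/'))  # e.g. 'text/html'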
Example #15
def file2bib(carrel, file, metadata=None):

    # configure
    BIB = 'bib'
    TXT = 'txt'
    CACHE = 'cache'
    COUNT = 24
    EXTENSION = '.txt'
    BIBEXTENSION = '.bib'
    HEADER = [
        'id', 'author', 'title', 'date', 'pages', 'extension', 'mime', 'words',
        'sentence', 'flesch', 'summary', 'cache', 'txt'
    ]
    PROCESS = 'textrank'

    # require
    from pathlib import Path
    from textacy import text_stats
    from tika import detector
    from tika import parser
    import os
    import spacy
    import pytextrank

    # initialize
    authorFound = False
    dateFound = False
    titleFound = False
    title = name2key(file)
    extension = os.path.splitext(os.path.basename(file))[1]
    key = name2key(file)
    pages = ''
    summary = ''
    localLibrary = configuration('localLibrary')

    # debug
    if VERBOSE: click.echo(('\t%s' % key), err=True)

    # get the text, and if not, then return; the whole point is to have content to read!
    parsed = parser.from_file(file)
    text = parsed['content']
    if not text: return

    # get metadata from the metadata file
    if str(type(metadata)) == "<class 'pandas.core.frame.DataFrame'>":

        # parse
        index = Path(file).name

        # check to see if the index value exists
        if index in metadata.index:

            if 'author' in metadata:

                author = str(metadata.loc[index]['author'])
                authorFound = True

            if 'title' in metadata:

                title = metadata.loc[index]['title']
                titleFound = True

            if 'date' in metadata:

                date = str(metadata.loc[index]['date'])
                dateFound = True

    # get metadata from the source file
    metadata = parsed['metadata']
    mimetype = detector.from_file(file)

    # author
    if not authorFound:

        if 'creator' in metadata:
            author = metadata['creator']
            if (isinstance(author, list)): author = author[0]

        else: author = ''

    # title
    if not titleFound:

        if 'title' in metadata:
            title = metadata['title']
            if (isinstance(title, list)): title = title[0]
            title = ' '.join(title.split())

    # date
    if not dateFound:

        if 'date' in metadata:
            date = metadata['date']
            if (isinstance(date, list)): date = date[0]
            date = date[:date.find('T')]

        else:
            date = ''

    # number of pages
    if 'xmpTPg:NPages' in metadata:
        pages = metadata['xmpTPg:NPages']
        if (isinstance(pages, list)): pages = pages[0]

    # model the text
    nlp = spacy.load(MODEL)
    nlp.max_length = (len(text) + 1)
    nlp.add_pipe(PROCESS)
    doc = nlp(text)

    # summarize
    summary = summarize(doc)

    # parse out only the desired statistics
    words = text_stats.n_words(doc)
    sentences = text_stats.n_sents(doc)
    syllables = text_stats.n_syllables(doc)
    flesch = int(text_stats.readability.flesch_reading_ease(doc))

    # cache and text locations
    txt = Path(TXT) / (key + EXTENSION)
    cache = Path(CACHE) / (key + extension)

    # debug
    if VERBOSE == 2:

        # provide a review
        click.echo('        key: ' + key, err=True)
        click.echo('     author: ' + author, err=True)
        click.echo('      title: ' + title, err=True)
        click.echo('       date: ' + date, err=True)
        click.echo('  extension: ' + extension, err=True)
        click.echo('      pages: ' + pages, err=True)
        click.echo('  mime-type: ' + mimetype, err=True)
        click.echo('    summary: ' + summary, err=True)
        click.echo('      words: ' + str(words), err=True)
        click.echo('  sentences: ' + str(sentences), err=True)
        click.echo('     flesch: ' + str(flesch), err=True)
        click.echo('      cache: ' + str(cache), err=True)
        click.echo('        txt: ' + str(txt), err=True)
        click.echo('', err=True)

    # open output
    output = localLibrary / carrel / BIB / (key + BIBEXTENSION)
    with open(output, 'w', encoding='utf-8') as handle:

        try:

            # output the header and the data
            handle.write('\t'.join(HEADER) + '\n')
            handle.write('\t'.join([
                key, author, title,
                str(date), pages, extension, mimetype,
                str(words),
                str(sentences),
                str(flesch), summary,
                str(cache),
                str(txt)
            ]) + '\n')

        # trap weird TypeError
        except TypeError:
            click.echo((
                "\nWARNING (TypeError): Probably weird author value extracted from PDF file (key: %s). Call Eric.\n"
                % key),
                       err=True)

    # check for text, and it should exist; famous last words
    if text:

        # configure output and output
        output = localLibrary / carrel / TXT / (key + EXTENSION)
        with open(output, 'w', encoding='utf-8') as handle:
            handle.write(text)
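file2bib leans on helpers defined elsewhere in its program (name2key, configuration, summarize, MODEL, VERBOSE). As one example, a plausible name2key, assuming it returns the bare filename without its extension:

import os

def name2key(file):
    # Assumed behavior: 'path/to/Report.pdf' -> 'Report'
    return os.path.splitext(os.path.basename(file))[0]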
Example #16
    def test_detect_pdf(self):
        resp = from_file('tika/tests/arguments/Newton.pdf')
        self.assertEqual(resp, 'application/pdf')
Example #17
def determineFiletype(filepath):
    mimetype = detector.from_file(filepath)
    print(mimetype)
    if (mimetype in GEOPHYSICS_MIMETYPES): return mimetype, 'Geophysics'
    if (mimetype in GEOCHEMESTRY_MIMETYPES): return mimetype, 'Geochemistry'
    return mimetype, 'unknown'
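determineFiletype expects two MIME whitelists that the snippet omits; illustrative values only, not the real lists:

GEOPHYSICS_MIMETYPES = {'application/x-netcdf', 'application/grib'}
GEOCHEMESTRY_MIMETYPES = {'text/csv', 'application/vnd.ms-excel'}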
Example #18
if __name__ == '__main__':

    chunk = 1
    pp = preprocess()
    pp_nobeta = preprocess(1)
    temp_path = ''
    try:
        #if True:
        cnt = 0
        for root, dirs, files in os.walk(pp.path):
            for name in files:
                temp_path = os.path.join(root, name)
                if os.path.isfile(temp_path) and os.path.getsize(
                        temp_path
                ) > 0:  #and temp_path[-4:] != 'json' and temp_path[-2:] != 'py' and temp_path[-2:] != 'sh' and temp_path[-3:] != 'txt' and temp_path[-5:] != 'Store' and temp_path[-3:] != 'pyc' and temp_path[-3:] != 'csv':
                    filetype = detector.from_file(temp_path)
                    #print filetype
                    table = pp.computeOnlyFingerPrint(temp_path)
                    table.insert(0, filetype)
                    pp.output.append(table)

                    table_nobeta = pp_nobeta.computeOnlyFingerPrint(temp_path)
                    table_nobeta.insert(0, filetype)
                    pp_nobeta.output.append(table_nobeta)

                    cnt += 1
                if cnt % 100 == 0:
                    print(cnt)
                if cnt > 0 and cnt % 10000 == 0:
                    df = pd.DataFrame(pp.output)
                    df.to_csv('temp_3p_data.csv', sep=',', index=False)
            parsedData=""
            path_to_file = path+"/"+str(file)
            print path_to_file

            urltext,typeofrequest,requesthostname,responsehostname,responseheader,responsestatus = getdatafromdoc(path_to_file)

            data = {}
            data["url"]=urltext
            data["typeofrequest"]=typeofrequest
            data["requesthostname"]=requesthostname
            data["responsehostname"]=responsehostname
            data["responsestatus"]=responsestatus
            data["responseheader"]=responseheader

            #Get NER data
            docType = detector.from_file(path_to_file)
            if docType in tagRatioFileTypes:
                buffer = handleHtml(path_to_file,docType)
            else:
                try:
                    buffer=subprocess.check_output(['java', '-jar', tikaSnapshotPath, '-t', path_to_file])
                except:
                    errorFile.write(path_to_file+"\n")
                    continue
            if buffer is None:
                errorFile.write(path_to_file + "\n")
                continue
            if len(buffer) == 0:
                errorFile.write(path_to_file + "\n")
                #continue
Example #20
    def test_detect_doc(self):
        resp = from_file('tika/tests/arguments/Newton.doc')
        self.assertEqual(resp, 'application/msword')
Example #21
reload(sys)
#so that we don't get unicode encode error
sys.setdefaultencoding('utf-8')
path = "C:\\Users\\HP\\Desktop\\test_data\\"
directory = os.listdir(path)
start = time.time()
print(directory)
count = 0
type_dict = dict()
for fpath in directory:
    #parsing through file to extract metadata and content
    file_path = path + fpath
    parsed = parser.from_file(file_path)

    #detecting file type
    file_type = detector.from_file(file_path)
    print(file_type)
    if (file_type) in type_dict:
        type_dict[file_type].append(directory[count])
    else:
        type_dict[file_type] = [directory[count]]
    parsed['id'] = str(count)

    #converting output from python dict to xml
    xml = dicttoxml(parsed)
    dic = {'id': str(count), "content": parsed['content']}
    #dic.update(parsed['metadata']
    solr.add([{
        "id": str(count),
        "content": parsed['content'],
        "payloads": parsed['metadata']
    }])
Example #22
# Function uses regular expression for identifying measurement present in text.
"""
def getMeasure(buffer):
    measurements=re.findall(r'\d+[a-zA-Z]+|\d+\s{1,3}[a-zA-Z]+|\d+\.\d+[a-zA-Z]+|\d+\.\d+\s[a-zA-Z]+',buffer)
    return measurements
"""

# Main Processing.
measurementJson = {}
for path, dirs, files in os.walk(path):
    for file in files:
        if file not in ".DS_Store":
            parsedData = ""
            path_to_file = path + "/" + str(file)
        print(path_to_file)
            docType = detector.from_file(path_to_file)
            if docType in tagRatioFileTypes:
                buffer = handleHtml(path_to_file, docType)
            else:
                try:
                    buffer = subprocess.check_output(
                        ['java', '-jar', tikaSnapshotPath, '-t', path_to_file])
                except:
                    errorFile.write(path_to_file + "\n")
                    continue
            if buffer is None:
                errorFile.write(path_to_file + "\n")
                continue
            if (len(buffer) == 0):
                errorFile.write(path_to_file + "\n")
                continue
Example #23
def test_detect():
    """test_detect"""
    from tika import detector
    print_stars()
    print(detector.from_file(doc))
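test_detect assumes a module-level doc path and a print_stars helper defined elsewhere in the test file; illustrative stand-ins:

doc = 'tika/tests/arguments/Newton.pdf'  # any local test document works

def print_stars():
    print('*' * 40)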
Example #24
#!/usr/bin/env python
import os
import tika
#tika.initVM()
from tika import parser
from tika import detector
#tika.TikaClientOnly = True
#parsed = parser.from_file('parser.java')
print(detector.from_file('../examples/eml-200/00_eml-200.xml'))

home = os.getenv('HOME')
tika.tika.TikaServerClasspath = home + '/git/geotopicparser-utils/mime:' + home + '/git/geotopicparser-utils/models/polar'
print(
    detector.from_file(
        'https://github.com/chrismattmann/geotopicparser-utils/tree/master/geotopics/polar.geot'
    ))
Example #26
    print("--------------------------------------------------------------")
    print("\n\n\n")

print("--------------Idioma do arquivo--------------")
print("O idioma do texto eh: ", language.from_file(path_pasta_arquivos + arq7),
      '\n\n')  #Detecta o idioma do arquivo

print("--------------Traducao arquivo--------------")
print(translate.from_file(path_pasta_arquivos + arq7, 'en',
                          'es'))  #Faz uma traducao do idioma de origem

print("\n\n")

print("--------------Classificacao dos arquivos--------------"
      )  #Tipos dos arquivos MIME
for arquivo in arquivos:
    print("Nome arquivo: %s \tTipo: %s" %
          (arquivo, detector.from_file(path_pasta_arquivos + arquivo)))

print("\n\n")
print("--------------Metadados Audio--------------")
parsed = parser.from_file(path_pasta_arquivos + arq4)  #Faz um parse do arquivo
metadata = parsed["metadata"]
print(json.dumps(metadata, indent=4))  #Imprime em um formato melhor

print("\n\n")
print("--------------Metadados IMG--------------")
parsed = parser.from_file(path_pasta_arquivos + arq5)  #Faz um parse do arquivo
metadata = parsed["metadata"]
print(json.dumps(metadata, indent=4))  #Imprime em um formato melhor
Example #27
def start():
    form = ReusableForm(request.form)
    #solr = pysolr.Solr('http://localhost:8983/solr/', timeout=10)

    reload(sys)
    #so that we don't get unicode encode error
    sys.setdefaultencoding('utf-8')
    #path = "C:\\Users\\HP\\Desktop\\test_data\\"
    global path
    path = request.form['folder']

    directory = os.listdir(path)
    start = time.time()
    print(directory)
    count = 0
    type_dict = dict()
    for fpath in directory:
        #parsing through file to extract metadata and content
        file_path = path + fpath
        parsed = parser.from_file(file_path)

        #detecting file type
        file_type = detector.from_file(file_path)
        print(file_type)
        if (file_type) in type_dict:
            type_dict[file_type].append(directory[count])
        else:
            type_dict[file_type] = [directory[count]]
        parsed['id'] = str(count)

        #converting output from python dict to xml
        #xml = dicttoxml(parsed)
        #dic = {'id':str(count),"content":parsed['content']}
        #dic.update(parsed['metadata']
        if (parsed['content'] is None):
            #print "working"
            parsed['content'] = "crime"

        solr.add([{
            "id": str(count),
            "content": parsed['content'],
            "payloads": parsed['metadata']
        }])
        #{"id":str(count),"content":parsed['content']}

        #formatting xml using xml.dom.minidom
        #dom = parseString(xml)
        #pretty_output = dom.toprettyxml()
        count = count + 1

        #creating, writing(xml), closing file
        #f = open(str(count)+ 'webpages','w+')
        #f.write(pretty_output)
        #f.close()

    #time elapsed
    #avg time between 0.5 and 0.7 seconds
    #specs: 3.8 GB RAM, i5-4200U, 1.6GHZ x 4
    end = time.time()
    #print(parsed['metadata'].keys())
    t_time = end - start
    print(t_time)
    #result = solr.search("pegasus")
    #for r in result:
    #	id = r['id']
    #print directory[int(r['id'])]
    #print path+directory[int(r['id'])]
    print(type_dict)
    return render_template('form2.html', form=form, type=type_dict)
Example #28
import os
from shutil import copyfile
import tika
from tika import detector

# Set the directory you want to start from
rootDir = '/media/jaydeep/mySpace/spring2016/599/polarfulldump'

for dirName, subdirList, fileList in os.walk(rootDir):

    print('Found directory: %s' % dirName)
    for fname in fileList:
        filetype = detector.from_file(dirName + '/' + fname)
        if filetype == 'video/mp4':
            copyfile(dirName + '/' + fname, '/media/jaydeep/mySpace/spring2016/599/polarmp4/' + fname)

        print('\t%s' % dirName + '/' + fname)
Example #29
#!/usr/bin/env python
import tika
#tika.initVM()
from tika import parser
from tika import detector

tika.TikaClientOnly = True
#parsed = parser.from_file('parser.java')
print(detector.from_file('parser.java'))
"""
parsed  = parser.from_buffer('Good evening, Dave', 'http://tika:9998/tika')
print(parsed["metadata"])
print(parsed["content"])
"""
Example #30
    def detect_ext(self):
        raw = detector.from_file(self.file)
        return mime_to_ext[raw]
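detect_ext assumes a mime_to_ext table mapping detected MIME types to file extensions; a minimal illustrative version:

mime_to_ext = {
    'application/pdf': '.pdf',
    'text/plain': '.txt',
    'image/jpeg': '.jpg',
}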
Example #31
#!/usr/bin/env python

from __future__ import division
import json
import os
from SolrClient import SolrClient
import sys
from tika import detector

solr = SolrClient('http://localhost:8983/solr')
walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1]))
walk_i = 0
ratios = {}
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        path = root + '/' + file
        file_size = os.stat(path).st_size
        if file_size == 0: continue
        mime = detector.from_file(path)
        total, n = ratios.get(mime, (0, 0))  # avoid shadowing the builtin sum()
        ratios[mime] = total + len(
            json.dumps(
                solr.query('collection1', {
                    'q': 'id:' + file
                }).data['response']['docs'])) / file_size, n + 1
        walk_i += 1
        print(str(walk_i * 100 // walk_n) + '%\r', end='')
with open('size-diversity.json', 'w') as f:
    json.dump({mime: total / n for mime, (total, n) in ratios.items()}, f)
Example #32
import os
import shutil
import sys
import traceback
from tika import detector


rootdir=sys.argv[1] #input dir
outdir=sys.argv[2]   #output dir

#dirwalk
for root, subdirs, files in os.walk(rootdir):
    for file in files:
        try:
            if(os.path.isdir(os.path.join(root,file))):
                continue
            #detect the mime type using Tika
            detect=detector.from_file(os.path.join(root,file))
            result=detect.replace("/","_")


            #create a directory in the output folder with the mime type name
            dirname=os.path.join(outdir,result)

            if not os.path.exists(dirname):
                os.makedirs(dirname)

            #move the file to the directory
            shutil.move(os.path.join(root,file), os.path.join(dirname,file))
            if not (os.path.exists(os.path.join(dirname,file))):
                print("Something went wrong!")

        except OSError as exc:
            print(exc.strerror)
            continue
Example #33
	exit()

# initialize
author     = ''
file       = sys.argv[ 1 ]
title      = os.path.splitext( os.path.basename( file ) )[ 0 ]
extension  = os.path.splitext( os.path.basename( file ) )[ 1 ]
id         = title
date       = ''
pages      = ''
txt        = TXT + '/' + id + '.txt'
cache      = CACHE + '/' + id + extension
engine     = create_engine( 'sqlite:///' + DATABASE )

# extract mime-type, just in case
mimetype = detector.from_file( file )

# extract metadata
parsed = parser.from_file( file )
metadata = parsed[ "metadata" ] 

# get (possible) pre-existing bibliographic values
escape = id.replace( "'", "''" )
query          = 'SELECT id, author, title, date FROM bib where id is "{}"'.format( escape )
bibliographics = pd.read_sql_query( query, engine, index_col='id' )

# parse author
if ( bibliographics.loc[ escape ,'author'] ) : author = bibliographics.loc[ escape,'author']
else :
	if 'creator' in metadata :
		author = metadata[ 'creator' ]
Example #34
#specify a list of directories as input
rootdirs=[os.path.normpath("E:/polardata_chosen/"),
          os.path.normpath("E:/polardata_octet/"),
          os.path.normpath("E:/polardata_sorted/")]

mimetypes={}
for rootdir in rootdirs:
    #dirwalk
    for root, subdirs, files in os.walk(rootdir):
        for file in files:
            try:

                if(os.path.isdir(os.path.join(root,file))):
                    continue
                #use tika to detect the mime type
                detect=detector.from_file(os.path.join(root,file))

                #build the dictionary of the mime-type and count of files of that mime type
                if detect in mimetypes:
                    mimetypes[detect]+=1
                else:
                    mimetypes[detect]=1
            except OSError as exc:
                print(exc.strerror)
                continue
            except Exception as err:
                print(traceback.format_exc())
                continue


#print the mime type counts to a json file