Ejemplo n.º 1
0
def index(search_dir, index_dir):
    cmd = 'pdftotext "%s" %s/loog.txt'
    dirs, files = rsync.ls(search_dir)
    files = [(f, size) for (f, size) in files if '.pdf' in f]
    N = len(files)
    A = sps.lil_matrix((N, cols))
    print A.shape
    df_files = []
    for i, (f, size) in enumerate(files):
        file = f.replace("\\", "/")
        print file
        if ".pdf" in file:
            cmd2 = cmd % (f, os.environ['TEMP'])
            os.system(cmd2)
            lowers = open(
                "%s/loog.txt" %
                os.environ['TEMP']).read().decode("ISO-8859-1").lower()
            tokens = nltk.word_tokenize(lowers)
            tokens = stem_tokens(tokens)
            print tokens[:30]
            for token in tokens:
                A[i, hash(token) % cols] += 1
            df_files.append([file, size])

    df = A.copy()
    df[df > 0] = 1.
    df = np.array(df.sum(axis=0))
    idf = df.copy()
    idf[df.nonzero()] = np.log(N / df[df.nonzero()])
    io.mmwrite(index_dir + "/loogle_idf.mtx", idf)

    tf = A.copy().tocoo()
    tf.data = 1 + np.log(tf.data)
    tfidf = sps.csr_matrix(tf.multiply(idf))
    tfidf = normalize(tfidf, norm='l2', axis=1)
    io.mmwrite(index_dir + "/loogle_tfidf.mtx", tfidf)
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(index_dir + "/loogle_files.csv", index=None)
Ejemplo n.º 2
0
        self.p = subprocess.Popen(
            [exe, "-autoexit", "-fs", "%s" % file],
            stdout=subprocess.PIPE,
            shell=True)
        self.p.wait()

    def term(self):
        print 'terminating', self.p.pid
        os.system("kill -f %d" % self.p.pid)


# Script entry point: repeatedly picks a video file under base_dir and
# appends its path to a log file in TEMP.
# NOTE(review): this fragment appears to be cut off inside the loop; as
# shown, the loop never exits and only writes log lines.
if __name__ == "__main__":

    # Root directory that is listed for candidate files.
    base_dir = "e:/shows"

    dirs, files = ls(base_dir)

    # Keep element 0 of every entry (presumably the path -- confirm against
    # ls) when it contains one of the substrings ".avi", "mkv" or "mp4".
    files = [
        x[0] for x in files if ".avi" in x[0] or "mkv" in x[0] or "mp4" in x[0]
    ]

    # Opened in append mode, so earlier log lines are kept.
    fout = open("%s/%s" % (os.environ['TEMP'], logfile), "a")
    while (True):
        # Draw a seed, request one number from twister for it, and reduce
        # that number modulo len(files) to get a valid list index.
        seed = random.choice(range(10000))
        rnd = twister.get_random_numbers(seed, 1)
        print len(files), 'files'
        rnd = rnd[0] % len(files)
        file = files[rnd]
        # Convert forward slashes to backslashes before logging.
        file = file.replace('/', '\\')
        fout.write(file)
        fout.write("\n")
Ejemplo n.º 3
0
def index(crawl_dir,index_dir,get_first_N=None):

    df_files = []; file_indexed_dict = {} ; index = None
    if os.path.isfile(index_dir + "/loogle_files.csv" ):
        df = pd.read_csv(index_dir + "/loogle_files.csv")
        df_files = df[['file','size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        index = open_dir(index_dir)
    else:     
        pdf_schema = Schema(path = ID(stored=True),  
                            title = TEXT(stored=True), 
                            text = TEXT) 
        index = create_in(index_dir, pdf_schema)
        
    writer = index.writer() 

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\","/"),size) for (f,size) in files ]
    files = [(f,size) for (f,size) in files if os.path.splitext(f)[1] in exts]

    files_crawled_dict = dict(files)
    tmp = {} # needed bcz cannot change file_indexed_dict while iterating
    for ff in file_indexed_dict:
        # remove file from index if file exists in index, but not on
        # file system
        if ff not in files_crawled_dict:
            print ff, 'removed'
            writer.delete_by_term('path', unicode(ff))
            tmp[ff] = file_indexed_dict[ff]
        elif files_crawled_dict[ff] != file_indexed_dict[ff]:
            # this is the only section we do not add to tmp this is
            # how I remove an updated file from my index dictionary so
            # its absence will be detected below, and will be freshly
            # reindexed
            print ff, 'size different update'
            writer.delete_by_term('path', unicode(ff))
        else:
            tmp[ff] = file_indexed_dict[ff]

    # put it back in
    file_indexed_dict = tmp
                
    if get_first_N: files = files[:get_first_N]
        
    for i,(file,size) in enumerate(files):
        if file in file_indexed_dict:
            print 'skipping', file
            continue
        print 'processing', file
        ext = os.path.splitext(file)[1]
        if ext == ".pdf" :
            cmd = pdfcmd % (file,os.environ['TEMP'])
            os.system(cmd)
        elif ext == ".djvu":
            cmd = djvucmd % (file,os.environ['TEMP'])
            os.system(cmd)
            os.system("unix2dos -7 %s/loog.txt" % os.environ['TEMP'])
        elif ext == ".html":
            with codecs.open(file, encoding='utf-8') as f:
                content = f.read()
            content = webarticle2text.extractFromHTML(content)
            fout = open("%s/loog.txt" % os.environ['TEMP'],"w")
            fout.write(content.encode("utf8"))
            fout.close()            
        elif ext == ".txt":
            shutil.copy(file, "%s/loog.txt" % os.environ['TEMP'])
            

        # turn the file name itself into content as well just in case,
        # if text conversion does not output anything, at least we can
        # use some information from the file name for search
        filename_as_content = os.path.basename(file).replace("_"," ").replace("-"," ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts: filename_as_content = filename_as_content.replace(x,"")
        filename_as_content += " "
        
        with codecs.open("%s/loog.txt" % os.environ['TEMP'], encoding='utf-8') as f:
            content = f.read()
        writer.add_document(path = unicode(file),
                            title = unicode(filename_as_content),
                            text = unicode(content))
        df_files.append([file, size])

    writer.commit() 
    df_files = pd.DataFrame(df_files,columns=['file','size'])
    df_files.to_csv(index_dir + "/loogle_files.csv",index=None)
Ejemplo n.º 4
0
# Plays mp3 files found under sys.argv[1] one by one, randomly. 
# Meant to simulate a radio.
import pyaudio, struct
import glob, os, random, sys
import threading, numpy as np
import datetime, random
from rsync import ls
import select, rndplay

# Log of the selected files; opened with "w", so it starts empty each run.
fout = open("/tmp/vidplay.out","w")

# Main loop: list the directory, pick one entry, log it, play it, then
# offer to delete it.
# NOTE(review): this fragment is cut off inside the nested input() helper.
while True:
    print "Music Dir", sys.argv[1]    
    # The directory is listed again on every iteration.
    # NOTE(review): `list` shadows the builtin of the same name.
    dirs,list = ls(sys.argv[1])
    print "Files", len(list)
    # presumably returns an index in range(len(list)) -- confirm in rndplay
    idx = rndplay.my_random(len(list))
    print "show idx selected", idx, "song", list[idx][0]
    fout.write(str(list[idx][0]) + "\n")
    # Flush so the log is current while the player is running.
    fout.flush()
    print '\n'
    #cmd = "/usr/bin/ffplay -nodisp '%s'" % list[idx]
    # NOTE(review): the path is wrapped in single quotes for the shell, so a
    # file name containing a single quote would break this command.
    cmd = "mplayer '%s' -fs " % list[idx][0]
    print cmd
    # os.system blocks until the command has finished.
    os.system(cmd)
    print "Delete? (Press d for delete)..."
    k=""
    # NOTE(review): this helper shadows the builtin input().
    def input():
        global k
        i = 0
        while i < 1:
            i = i + 1
Ejemplo n.º 5
0
        threading.Thread.__init__(self)
        self.file = file
    def run(self):
        print self.file
        self.p = subprocess.Popen([exe, "-autoexit", "-fs", "%s" % file], 
                                  stdout=subprocess.PIPE, shell=True)
        self.p.wait()
    def term(self):
        print 'terminating', self.p.pid
        os.system("kill -f %d" % self.p.pid)

# Script entry point: repeatedly picks a random video file under base_dir,
# logs its path and starts a Runner thread for it.
# NOTE(review): this fragment is cut off right after the inner `while True:`.
if __name__ == "__main__": 

    # Root directory that is listed for candidate files.
    base_dir = "/media/burak/New Volume/shows"

    dirs,files = ls(base_dir)
    
    # Keep element 0 of every entry (presumably the path -- confirm against
    # ls) when it contains one of the substrings ".avi", "mkv" or "mp4".
    files = [x[0] for x in files if ".avi" in x[0] or "mkv" in x[0] or "mp4" in x[0]]

    # Opened in append mode, so earlier log lines are kept.
    fout = open("%s/%s" % (os.environ['TEMP'],logfile), "a")
    while (True):        
        # Uniformly random index into files.
        rnd = random.choice(range(len(files)))
        print len(files), 'files'
        file = files[rnd]
        # Convert forward slashes to backslashes before logging and playing.
        file = file.replace('/','\\')
        fout.write(file)
        fout.write("\n")
        # Flush so the log is current while the player is running.
        fout.flush()
        # Runner.start() comes from threading.Thread; the thread executes
        # Runner.run().
        t = Runner(file)
        t.start()
        while True: 
Ejemplo n.º 6
0
# Plays mp3 files found under sys.argv[1] one by one, randomly. 
# Meant to simulate a radio.
import glob, os, random, sys
import threading
import select
from rsync import ls

# Log of the selected files; opened with "w", so it starts empty each run.
fout = open("/tmp/vidplay.out","w")

# Main loop: list the directory, pick one entry at random, log it, play it,
# then offer to delete it.
# NOTE(review): this fragment is cut off inside the nested input() helper.
while True:
    print "Music Dir", sys.argv[1]    
    # The directory is listed again on every iteration.
    # NOTE(review): `list` shadows the builtin of the same name.
    dirs,list = ls(sys.argv[1])
    # Uniformly random index into the listing.
    idx = random.choice(range(len(list)))
    print "show idx selected", idx, "song", list[idx][0]
    fout.write(str(list[idx][0]) + "\n")
    # Flush so the log is current while the player is running.
    fout.flush()
    print '\n'
    #cmd = "/usr/bin/ffplay -nodisp '%s'" % list[idx]
    # NOTE(review): the path is wrapped in single quotes for the shell, so a
    # file name containing a single quote would break this command.
    cmd = "mplayer '%s' -fs " % list[idx][0]
    print cmd
    # os.system blocks until the command has finished.
    os.system(cmd)
    print "Delete? (Press d for delete)..."
    k=""
    # NOTE(review): this helper shadows the builtin input().
    def input():
        global k
        i = 0
        while i < 1:
            i = i + 1
            # Wait at most 2 seconds for stdin to become readable.
            r,w,x = select.select([sys.stdin.fileno()],[],[],2)
            if len(r) != 0:
                k  =sys.stdin.readline()
Ejemplo n.º 7
0
def index(crawl_dir, index_dir, get_first_N=None):

    df_files = []
    file_indexed_dict = {}
    index = None
    if os.path.isfile(index_dir + "/loogle_files.csv"):
        df = pd.read_csv(index_dir + "/loogle_files.csv")
        df_files = df[['file', 'size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        index = open_dir(index_dir)
    else:
        pdf_schema = Schema(path=ID(stored=True),
                            title=TEXT(stored=True),
                            text=TEXT)
        index = create_in(index_dir, pdf_schema)

    writer = index.writer()

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\", "/"), size) for (f, size) in files]
    files = [(f, size) for (f, size) in files
             if os.path.splitext(f)[1] in exts]

    files_crawled_dict = dict(files)
    tmp = {}  # needed bcz cannot change file_indexed_dict while iterating
    for ff in file_indexed_dict:
        # remove file from index if file exists in index, but not on
        # file system
        if ff not in files_crawled_dict:
            print ff, 'removed'
            writer.delete_by_term('path', unicode(ff))
            tmp[ff] = file_indexed_dict[ff]
        elif files_crawled_dict[ff] != file_indexed_dict[ff]:
            # this is the only section we do not add to tmp this is
            # how I remove an updated file from my index dictionary so
            # its absence will be detected below, and will be freshly
            # reindexed
            print ff, 'size different update'
            writer.delete_by_term('path', unicode(ff))
        else:
            tmp[ff] = file_indexed_dict[ff]

    # put it back in
    file_indexed_dict = tmp

    if get_first_N: files = files[:get_first_N]

    for i, (file, size) in enumerate(files):
        if file in file_indexed_dict:
            print 'skipping', file
            continue
        print 'processing', file
        ext = os.path.splitext(file)[1]
        if ext == ".pdf":
            cmd = pdfcmd % (file, os.environ['TEMP'])
            os.system(cmd)
        elif ext == ".djvu":
            cmd = djvucmd % (file, os.environ['TEMP'])
            os.system(cmd)
            os.system("unix2dos -7 %s/loog.txt" % os.environ['TEMP'])
        elif ext == ".html":
            with codecs.open(file, encoding='utf-8') as f:
                content = f.read()
            content = webarticle2text.extractFromHTML(content)
            fout = open("%s/loog.txt" % os.environ['TEMP'], "w")
            fout.write(content.encode("utf8"))
            fout.close()
        elif ext == ".txt":
            shutil.copy(file, "%s/loog.txt" % os.environ['TEMP'])

        # turn the file name itself into content as well just in case,
        # if text conversion does not output anything, at least we can
        # use some information from the file name for search
        filename_as_content = os.path.basename(file).replace("_", " ").replace(
            "-", " ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts:
            filename_as_content = filename_as_content.replace(x, "")
        filename_as_content += " "

        with codecs.open("%s/loog.txt" % os.environ['TEMP'],
                         encoding='utf-8') as f:
            content = f.read()
        writer.add_document(path=unicode(file),
                            title=unicode(filename_as_content),
                            text=unicode(content))
        df_files.append([file, size])

    writer.commit()
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(index_dir + "/loogle_files.csv", index=None)