def index(search_dir, index_dir):
    """Build a hashed TF-IDF index over the PDF files under ``search_dir``.

    Every PDF is converted to text with the external ``pdftotext`` tool,
    tokenized with ``nltk.word_tokenize``, passed through ``stem_tokens`` and
    counted into a sparse document/term matrix whose column for a token is
    ``hash(token) % cols`` (the hashing trick; ``cols`` is a module-level
    name defined outside this function).

    Three files are written into ``index_dir``:

    * ``loogle_idf.mtx``   -- the per-column idf weights,
    * ``loogle_tfidf.mtx`` -- the L2-normalized tf-idf matrix (one row per file),
    * ``loogle_files.csv`` -- the ``file``/``size`` listing, in row order.

    Parameters:
        search_dir: directory handed to ``rsync.ls`` to list candidate files.
        index_dir: directory that receives the three output files.

    NOTE: this is Python 2 code (``str.decode``); the ``print`` calls are
    written so that they behave the same under both print forms.
    """
    # Imported locally so the block does not depend on the file's import list.
    import subprocess
    from collections import Counter

    txt_path = "%s/loog.txt" % os.environ['TEMP']

    _dirs, files = rsync.ls(search_dir)
    # Match on the real extension; a plain substring test would also pick up
    # names such as "notes.pdf.bak".
    files = [(f, size) for (f, size) in files
             if os.path.splitext(f)[1].lower() == ".pdf"]
    N = len(files)
    A = sps.lil_matrix((N, cols))
    print(A.shape)

    df_files = []
    for i, (f, size) in enumerate(files):
        path = f.replace("\\", "/")
        print(path)
        df_files.append([path, size])

        # An argument list (no shell) keeps paths containing spaces, quotes
        # or other shell metacharacters intact.
        status = subprocess.call(["pdftotext", f, txt_path])
        if status != 0:
            # A non-zero exit status is treated as a failed conversion: the
            # temp file would still hold the previous document's text, so
            # leave this row empty instead of indexing stale content.
            print("pdftotext failed (%s) for %s" % (status, path))
            continue

        with open(txt_path) as fin:
            lowers = fin.read().decode("ISO-8859-1").lower()
        tokens = nltk.word_tokenize(lowers)
        tokens = stem_tokens(tokens)
        print(tokens[:30])

        # Aggregate first, then touch each sparse cell once; the resulting
        # counts are the same as incrementing the cell once per token.
        col_counts = Counter(hash(token) % cols for token in tokens)
        for col, count in col_counts.items():
            A[i, col] = count

    # Document frequency: in how many files each hashed column occurs.
    df = A.copy()
    df[df > 0] = 1.
    df = np.array(df.sum(axis=0))
    idf = df.copy()
    # Only columns that occur at all get a weight; float(N) keeps the
    # division a true division regardless of the dtype of df.
    idf[df.nonzero()] = np.log(float(N) / df[df.nonzero()])
    io.mmwrite(index_dir + "/loogle_idf.mtx", idf)

    # Sub-linear term frequency: 1 + log(count) on the stored entries only.
    tf = A.copy().tocoo()
    tf.data = 1 + np.log(tf.data)
    tfidf = sps.csr_matrix(tf.multiply(idf))
    tfidf = normalize(tfidf, norm='l2', axis=1)
    io.mmwrite(index_dir + "/loogle_tfidf.mtx", tfidf)

    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(index_dir + "/loogle_files.csv", index=None)
self.p = subprocess.Popen( [exe, "-autoexit", "-fs", "%s" % file], stdout=subprocess.PIPE, shell=True) self.p.wait() def term(self): print 'terminating', self.p.pid os.system("kill -f %d" % self.p.pid) if __name__ == "__main__": base_dir = "e:/shows" dirs, files = ls(base_dir) files = [ x[0] for x in files if ".avi" in x[0] or "mkv" in x[0] or "mp4" in x[0] ] fout = open("%s/%s" % (os.environ['TEMP'], logfile), "a") while (True): seed = random.choice(range(10000)) rnd = twister.get_random_numbers(seed, 1) print len(files), 'files' rnd = rnd[0] % len(files) file = files[rnd] file = file.replace('/', '\\') fout.write(file) fout.write("\n")
def index(crawl_dir,index_dir,get_first_N=None):
    """Incrementally (re)build the Whoosh full-text index for ``crawl_dir``.

    ``index_dir`` holds the Whoosh index plus ``loogle_files.csv``, a
    ``file``/``size`` listing of everything indexed so far.  On each run:

    * files listed in the CSV but no longer found by the crawl are deleted
      from the index,
    * files whose size differs from the recorded one are deleted and
      indexed again,
    * files not yet listed are converted to text and added.

    Parameters:
        crawl_dir: directory handed to ``rsync.ls`` to list candidate files.
        index_dir: directory containing (or receiving) the index and the CSV.
        get_first_N: if truthy, only the first N crawled files are considered
            for indexing (pruning still uses the complete crawl).

    NOTE: this is Python 2 code (``unicode``, ``str.decode``); the ``print``
    calls are written so that they behave the same under both print forms.
    """
    csv_path = index_dir + "/loogle_files.csv"
    txt_path = "%s/loog.txt" % os.environ['TEMP']

    known_rows = []
    file_indexed_dict = {}
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        known_rows = df[['file','size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        ix = open_dir(index_dir)
    else:
        pdf_schema = Schema(path = ID(stored=True),
                            title = TEXT(stored=True),
                            text = TEXT)
        ix = create_in(index_dir, pdf_schema)

    writer = ix.writer()

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\","/"),size) for (f,size) in files]
    files = [(f,size) for (f,size) in files if os.path.splitext(f)[1] in exts]
    files_crawled_dict = dict(files)

    # Keep only the entries that are still on disk with an unchanged size.
    still_valid = {}
    for ff in file_indexed_dict:
        if ff not in files_crawled_dict:
            # in the index, but gone from the file system
            print('%s removed' % ff)
            writer.delete_by_term('path', unicode(ff))
        elif files_crawled_dict[ff] != file_indexed_dict[ff]:
            # Size changed: forget the file here so the loop below sees it
            # as new and indexes it afresh.
            print('%s size different update' % ff)
            writer.delete_by_term('path', unicode(ff))
        else:
            still_valid[ff] = file_indexed_dict[ff]
    file_indexed_dict = still_valid

    # Rebuild the CSV rows from the surviving entries (original order, one
    # row per file).  Carrying the old rows over unchanged would keep removed
    # files listed forever and duplicate every updated file.
    df_files = []
    listed = set()
    for row in known_rows:
        name = row[0]
        if name in file_indexed_dict and name not in listed:
            listed.add(name)
            df_files.append([name, file_indexed_dict[name]])

    if get_first_N: files = files[:get_first_N]

    for (path,size) in files:
        if path in file_indexed_dict:
            print('skipping %s' % path)
            continue
        print('processing %s' % path)

        # Start from a clean slate: if the converter fails or writes nothing,
        # the previous document's text must not be indexed under this path.
        if os.path.isfile(txt_path):
            os.remove(txt_path)

        ext = os.path.splitext(path)[1]
        if ext == ".pdf":
            os.system(pdfcmd % (path,os.environ['TEMP']))
        elif ext == ".djvu":
            os.system(djvucmd % (path,os.environ['TEMP']))
            os.system("unix2dos -7 %s/loog.txt" % os.environ['TEMP'])
        elif ext == ".html":
            with codecs.open(path, encoding='utf-8') as f:
                content = f.read()
            content = webarticle2text.extractFromHTML(content)
            with open(txt_path,"w") as fout:
                fout.write(content.encode("utf8"))
        elif ext == ".txt":
            shutil.copy(path, txt_path)

        # turn the file name itself into content as well just in case,
        # if text conversion does not output anything, at least we can
        # use some information from the file name for search
        filename_as_content = os.path.basename(path).replace("_"," ").replace("-"," ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts:
            filename_as_content = filename_as_content.replace(x,"")
        filename_as_content += " "

        content = u""
        if os.path.isfile(txt_path):
            # errors='replace': one badly encoded document must not abort the
            # whole run before the writer is committed.
            with codecs.open(txt_path, encoding='utf-8', errors='replace') as f:
                content = f.read()

        writer.add_document(path = unicode(path),
                            title = unicode(filename_as_content),
                            text = unicode(content))
        df_files.append([path, size])

    writer.commit()
    df_files = pd.DataFrame(df_files,columns=['file','size'])
    df_files.to_csv(csv_path,index=None)
# Plays mp3 files found under sys.argv[1] one by one, randomly. # Meant to simulate a radio. import pyaudio, struct import glob, os, random, sys import threading, numpy as np import datetime, random from rsync import ls import select, rndplay fout = open("/tmp/vidplay.out","w") while True: print "Music Dir", sys.argv[1] dirs,list = ls(sys.argv[1]) print "Files", len(list) idx = rndplay.my_random(len(list)) print "show idx selected", idx, "song", list[idx][0] fout.write(str(list[idx][0]) + "\n") fout.flush() print '\n' #cmd = "/usr/bin/ffplay -nodisp '%s'" % list[idx] cmd = "mplayer '%s' -fs " % list[idx][0] print cmd os.system(cmd) print "Delete? (Press d for delete)..." k="" def input(): global k i = 0 while i < 1: i = i + 1
threading.Thread.__init__(self) self.file = file def run(self): print self.file self.p = subprocess.Popen([exe, "-autoexit", "-fs", "%s" % file], stdout=subprocess.PIPE, shell=True) self.p.wait() def term(self): print 'terminating', self.p.pid os.system("kill -f %d" % self.p.pid) if __name__ == "__main__": base_dir = "/media/burak/New Volume/shows" dirs,files = ls(base_dir) files = [x[0] for x in files if ".avi" in x[0] or "mkv" in x[0] or "mp4" in x[0]] fout = open("%s/%s" % (os.environ['TEMP'],logfile), "a") while (True): rnd = random.choice(range(len(files))) print len(files), 'files' file = files[rnd] file = file.replace('/','\\') fout.write(file) fout.write("\n") fout.flush() t = Runner(file) t.start() while True:
# Plays mp3 files found under sys.argv[1] one by one, randomly. # Meant to simulate a radio. import glob, os, random, sys import threading import select from rsync import ls fout = open("/tmp/vidplay.out","w") while True: print "Music Dir", sys.argv[1] dirs,list = ls(sys.argv[1]) idx = random.choice(range(len(list))) print "show idx selected", idx, "song", list[idx][0] fout.write(str(list[idx][0]) + "\n") fout.flush() print '\n' #cmd = "/usr/bin/ffplay -nodisp '%s'" % list[idx] cmd = "mplayer '%s' -fs " % list[idx][0] print cmd os.system(cmd) print "Delete? (Press d for delete)..." k="" def input(): global k i = 0 while i < 1: i = i + 1 r,w,x = select.select([sys.stdin.fileno()],[],[],2) if len(r) != 0: k =sys.stdin.readline()
def _prune_indexed(writer, indexed, crawled):
    """Delete vanished or resized files from the index; return the entries that are still valid."""
    kept = {}
    for path, size in indexed.items():
        if path not in crawled:
            # in the index, but gone from the file system
            print('%s removed' % path)
            writer.delete_by_term('path', unicode(path))
        elif crawled[path] != size:
            # Not kept on purpose: the caller will then treat the file as
            # new and index it afresh.
            print('%s size different update' % path)
            writer.delete_by_term('path', unicode(path))
        else:
            kept[path] = size
    return kept


def _extract_text(path, txt_path):
    """Convert ``path`` to text via the scratch file ``txt_path``; return u"" if nothing was produced."""
    # Remove the leftover of the previous document first, so that a failed
    # or empty conversion cannot be mistaken for this file's content.
    if os.path.isfile(txt_path):
        os.remove(txt_path)

    ext = os.path.splitext(path)[1]
    if ext == ".pdf":
        os.system(pdfcmd % (path, os.environ['TEMP']))
    elif ext == ".djvu":
        os.system(djvucmd % (path, os.environ['TEMP']))
        os.system("unix2dos -7 %s/loog.txt" % os.environ['TEMP'])
    elif ext == ".html":
        with codecs.open(path, encoding='utf-8') as f:
            html = f.read()
        article = webarticle2text.extractFromHTML(html)
        with open(txt_path, "w") as fout:
            fout.write(article.encode("utf8"))
    elif ext == ".txt":
        shutil.copy(path, txt_path)

    if not os.path.isfile(txt_path):
        return u""
    # errors='replace': one badly encoded document must not abort the whole
    # indexing run.
    with codecs.open(txt_path, encoding='utf-8', errors='replace') as f:
        return f.read()


def index(crawl_dir, index_dir, get_first_N=None):
    """Incrementally (re)build the Whoosh full-text index for ``crawl_dir``.

    ``index_dir`` holds the Whoosh index plus ``loogle_files.csv``, a
    ``file``/``size`` listing of everything indexed so far.  Files that
    disappeared are removed from the index, files whose size changed are
    re-indexed, and files not yet listed are converted to text and added.

    Parameters:
        crawl_dir: directory handed to ``rsync.ls`` to list candidate files.
        index_dir: directory containing (or receiving) the index and the CSV.
        get_first_N: if truthy, only the first N crawled files are considered
            for indexing (pruning still uses the complete crawl).

    NOTE: this is Python 2 code (``unicode``, ``str.decode``); the ``print``
    calls are written so that they behave the same under both print forms.
    """
    csv_path = index_dir + "/loogle_files.csv"
    txt_path = "%s/loog.txt" % os.environ['TEMP']

    known_rows = []
    file_indexed_dict = {}
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        known_rows = df[['file', 'size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        ix = open_dir(index_dir)
    else:
        pdf_schema = Schema(path=ID(stored=True),
                            title=TEXT(stored=True),
                            text=TEXT)
        ix = create_in(index_dir, pdf_schema)

    writer = ix.writer()

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\", "/"), size) for (f, size) in files]
    files = [(f, size) for (f, size) in files
             if os.path.splitext(f)[1] in exts]
    files_crawled_dict = dict(files)

    file_indexed_dict = _prune_indexed(writer, file_indexed_dict,
                                       files_crawled_dict)

    # Rebuild the CSV rows from the surviving entries (original order, one
    # row per file).  Carrying the old rows over unchanged would keep removed
    # files listed forever and duplicate every updated file.
    df_files = []
    listed = set()
    for row in known_rows:
        name = row[0]
        if name in file_indexed_dict and name not in listed:
            listed.add(name)
            df_files.append([name, file_indexed_dict[name]])

    if get_first_N:
        files = files[:get_first_N]

    for (path, size) in files:
        if path in file_indexed_dict:
            print('skipping %s' % path)
            continue
        print('processing %s' % path)

        content = _extract_text(path, txt_path)

        # turn the file name itself into content as well just in case,
        # if text conversion does not output anything, at least we can
        # use some information from the file name for search
        filename_as_content = os.path.basename(path).replace("_", " ").replace(
            "-", " ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts:
            filename_as_content = filename_as_content.replace(x, "")
        filename_as_content += " "

        writer.add_document(path=unicode(path),
                            title=unicode(filename_as_content),
                            text=unicode(content))
        df_files.append([path, size])

    writer.commit()
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(csv_path, index=None)