def CreateDataForLDA(Location, fileName=None):
    """Preprocess every ``*.txt`` file in ``Location`` into one string per file.

    Each file is read, non-ASCII characters are dropped, the text is split
    with ``WordPunctTokenizer``, tokens present in the English stop-word set
    are removed, and every remaining token is lemmatized and then stemmed.

    Args:
        Location: Folder to scan. It is created when it does not exist yet,
            in which case there are no files to process.
        fileName: Unused; kept so existing callers keep working.

    Returns:
        A list with one string per processed file. Each token in a string is
        preceded by a single space, so a non-empty string starts with a
        space; a file with no remaining tokens yields ``""``.
    """
    tokenizer = WordPunctTokenizer()
    english_stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Single-argument print(...) behaves exactly like the print statement
    # under Python 2 and keeps the function importable under Python 3.
    if os.path.exists(Location):
        print("The Folder you specified already exists, all the files those you want to use should be here")
    else:
        os.mkdir(Location)
        print("Folder got created")

    DataHolder = []
    # os.path.join adds the separator only when Location lacks one, so both
    # "data" and "data/" scan the files *inside* the folder.  Plain string
    # concatenation turned "data" into the pattern "data*.txt".
    for eachFileInFolder in glob.iglob(os.path.join(Location, "*.txt")):
        print("Now Processing the File : " + eachFileInFolder)
        FileContents = Path(eachFileInFolder).read_text()
        tokens = tokenizer.tokenize(
            FileContents.encode('ascii', errors='ignore'))
        # Stop-word filtering is an exact (case-sensitive) membership test.
        # Lemmatize first, then stem; every kept token gets a leading space.
        DataHolder.append("".join(
            " " + stemmer.stem(lemmatizer.lemmatize(word))
            for word in tokens
            if word not in english_stops))

    print("Hello, basic stemming and stop word removal is done")
    return DataHolder
Example #2
0
def upload(request):
    """Handle the sample upload form.

    For a non-POST request an empty ``DocumentForm`` is rendered with
    ``upload.html``.  For a POST with a valid form the uploaded file is
    saved, handed to ``submitsample.sh``, a ``Path`` record describing the
    submission is stored, and ``submited.html`` is rendered.  A POST with an
    invalid form falls through and re-renders ``upload.html`` with the bound
    form.

    Args:
        request: The incoming HTTP request object.

    Returns:
        The response produced by ``render``.
    """
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)
        if form.is_valid():
            samples_dir = "/home/fernando/Desktop/myproject/media/samples/"
            submit_script = ('/home/fernando/Desktop/myproject/myproject/'
                             'myapp/submitsample.sh')
            binaries_dir = '/home/fernando/.cuckoo/storage/binaries'
            analyses_dir = '/home/fernando/.cuckoo/storage/analyses/'

            uploaded = request.FILES['docfile']
            filename = uploaded.name
            sample_path = samples_dir + filename

            # Remove a previously uploaded file with the same name so the new
            # upload replaces it (remove duplicates).
            # NOTE(review): `Paths` is presumably pathlib.Path under an alias,
            # since `Path` below is used as a model -- confirm.
            if Paths(sample_path).is_file():
                os.remove(sample_path)

            newdoc = Document(docfile=uploaded)
            newdoc.save()

            pathtofile = sample_path.encode('ascii', 'ignore')
            # The file name comes from the client.  Pass it as a separate
            # argv entry instead of splicing it into a shell command line:
            # this prevents shell injection and keeps names containing
            # spaces or metacharacters intact.
            subprocess.call([submit_script, pathtofile])

            # Give the submission a moment to land in the storage folder
            # before looking for the most recently modified binary.
            time.sleep(5)
            newest_binary = max(
                (os.path.join(binaries_dir, entry)
                 for entry in os.listdir(binaries_dir)),
                key=os.path.getmtime)
            latest_file_sha = os.path.basename(newest_binary)

            latest_dir_analisys = os.path.basename(
                os.readlink(analyses_dir + 'latest'))
            # Directory numbered one past the current "latest" analysis.
            # NOTE(review): presumably the directory the new analysis will
            # get -- confirm against the submission script.
            dirpath = analyses_dir + str(int(latest_dir_analisys) + 1) + '/'

            newinfo = Path(namefile=filename,
                           idsha256=latest_file_sha,
                           pathto=dirpath)
            newinfo.save()
            print(newinfo.idsha256)
            return render(
                request, 'submited.html', {
                    'latest_file_sha': latest_file_sha,
                    'dirpath': dirpath,
                    'filename': filename
                })
    else:
        form = DocumentForm()  # An empty, unbound form

    # Render the upload page with the (empty or invalid bound) form.
    return render(request, 'upload.html', {'form': form})
Example #3
0
def main():
    """Poll the source folder forever and publish each new file's contents.

    Every cycle:
      1. Files in ``fld_src`` whose name starts with ``fle_prefix`` are moved
         to ``fld_pro``.
      2. Each prefixed file in ``fld_pro`` that is not yet tracked in
         ``futures`` is read, published to ``topic_path`` and given a
         completion callback.
      3. The callback moves the file to ``fld_arc`` on success or to
         ``fld_err`` on failure and stops tracking it.

    The function never returns.
    """
    log("INFO: starting application...")

    def get_callback(fut, file_name):
        # Factory so every callback is bound to its own file name.  The outer
        # `fut` argument is unused: the inner callback receives the completed
        # future itself.
        def callback(fut):
            try:
                log("INFO: response received for " + file_name + " "
                    + fut.result())
                os.rename(os.path.join(fld_pro, file_name),
                          os.path.join(fld_arc, file_name))
                futures.pop(file_name)
            except Exception as exc:
                # Log the exception that was actually raised.  It is the
                # future's own error when the publish failed, and the rename
                # error otherwise (where fut.exception() would be None).
                # Catching Exception rather than everything lets
                # KeyboardInterrupt/SystemExit propagate.
                log('ERROR: error processing file {} for {}.'.format(
                    exc, file_name))
                os.rename(os.path.join(fld_pro, file_name),
                          os.path.join(fld_err, file_name))
                futures.pop(file_name)

        return callback

    while True:
        # Are there any new files? Move them to processing.
        for file_name in os.listdir(fld_src):
            if file_name.startswith(fle_prefix):
                log('INFO: found new file ' + file_name)
                os.rename(os.path.join(fld_src, file_name),
                          os.path.join(fld_pro, file_name))
        time.sleep(.100)

        # Process files that are not already awaiting a response.
        for file_name in os.listdir(fld_pro):
            if not file_name.startswith(fle_prefix) or file_name in futures:
                continue
            # Add the file name to the expected-response queue before
            # publishing so the next cycle does not publish it again.
            futures[file_name] = None
            msg = Path(os.path.join(fld_pro, file_name)).read_text()
            future = publisher.publish(topic_path, data=msg.encode('utf-8'))
            future.add_done_callback(get_callback(future, file_name))
            log("INFO: sent publish request for " + file_name)

        # TODO: retry error files.
        # TODO: clean up archives.
        time.sleep(5)