Example #1
 def queryAlexa(self, url):
   php_url = 'http://localhost/ad_detect/get_url_category.php'
   php_url += '?site=' + urllib2.quote(url)
   try:
     response = urllib2.urlopen(php_url)
     html = response.read()
     ret = json.loads(html)['Response']['UrlInfoResult']['Alexa']['Related']['Categories']['CategoryData']
     # A single category comes back as a dict rather than a list
     if isinstance(ret, dict):
       path = ret['AbsolutePath']
       refined_path = '/'.join(path.split('/')[:3])
       if 'World' in refined_path or 'Region' in refined_path:
         return {}
     if isinstance(ret, list):
       empty = True
       for item in ret:
         path = item['AbsolutePath']
         refined_path = '/'.join(path.split('/')[:3])
         # Ignore region-based categories
         if 'World' not in refined_path and 'Region' not in refined_path:
           empty = False
       if empty:
         return {}
     return {'source': 'Alexa', 'category': ret}
   except Exception:
     return {}
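The '/'.join(path.split('/')[:3]) idiom above truncates an Alexa category path to its first two levels below Top; the filtering rule is easy to check in isolation (the category strings here are made up for illustration):

for p in ('Top/Shopping/Music/Instruments', 'Top/World/Deutsch/Online-Shops'):
    refined = '/'.join(p.split('/')[:3])
    kept = 'World' not in refined and 'Region' not in refined
    print(refined, 'kept' if kept else 'ignored')
# Top/Shopping/Music kept
# Top/World/Deutsch ignored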
Example #2
def fix_duplicates(duplicate_chunks):
    for path in duplicate_chunks:

        # deconstruct relevant information from chunk path, clean it
        path_components = path.split("/")
        if len(path_components) == 5:
            _, study_obj_id, username, data_stream, timestamp = path_components
        elif len(path_components) == 4:
            study_obj_id, username, data_stream, timestamp = path_components
        else:
            print(
                "You appear to have an invalid file path.  Please report this error to https://github.com/onnela-lab/beiwe-backend/issues"
            )
            raise Exception("invalid_path: %s" % path)

        # Not all files are chunkable; those need different handling.
        if data_stream not in CHUNKABLE_FILES:
            remove_all_but_one_chunk(path)
            continue

        try:
            FileToProcess.reprocess_originals_from_chunk_path(path)
        except Exception as e:
            # a chunk whose originals are already gone is not an error here
            if "did not find any matching files" not in str(e):
                raise
        remove_all_but_one_chunk(path)
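The two accepted path shapes unpack cleanly; a minimal standalone sketch (the component values are hypothetical, not real Beiwe chunk paths):

# one path with a leading prefix (5 components), one without (4 components)
for sample in ("prefix/study1/user1/gps/1539401400000",
               "study1/user1/gps/1539401400000"):
    parts = sample.split("/")
    if len(parts) == 5:
        _, study_obj_id, username, data_stream, timestamp = parts
    else:
        study_obj_id, username, data_stream, timestamp = parts
    print(study_obj_id, username, data_stream, timestamp)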
Example #3
 def getPageCategory(self, url):
   result = self.getPageRawCategory(url)
   if result == {}:
     return {}
   ret = {'source': '', 'category': []}
   # Two levels for Alexa (e.g. Top/Shopping/Music)
   if result['source'] == 'Alexa':
     ret['source'] = 'Alexa'
     try:
       if isinstance(result['category'], list):
         for entry in result['category']:
           refined_path = '/'.join(entry['AbsolutePath'].split('/')[:3])
           # Ignore region-based categories
           if 'World' in refined_path or 'Region' in refined_path:
             continue
           if refined_path not in ret['category']:
             ret['category'].append(refined_path)
       elif isinstance(result['category'], dict):
         refined_path = '/'.join(result['category']['AbsolutePath'].split('/')[:3])
         ret['category'].append(refined_path)
     except Exception:
       self.stats.increment('Category detection failed', 1)
       print 'ERROR PARSING:', result
   # Ignore all scores for Yahoo
   elif result['source'] == 'Yahoo':
     ret['source'] = 'Yahoo'
     try:
       for entry in result['category']:
         cat = entry['category']
         if cat not in ret['category']:
           ret['category'].append(cat)
     except Exception:
       self.stats.increment('Category detection failed', 1)
       print 'ERROR PARSING:', result
   # Alchemy
   elif result['source'] == 'Alchemy':
     ret['source'] = 'Alchemy'
     ret['category'] = [result['category']]
   # Bluecoat
   elif result['source'] == 'Bluecoat':
     ret['source'] = 'Bluecoat'
     try:
       for cat in result['category']:
         if cat not in ret['category']:
           ret['category'].append(cat)
     except Exception:
       self.stats.increment('Category detection failed', 1)
       print 'ERROR PARSING:', result
   else:
     ret = {}
   if ret != {} and ret['source'] != '':
     self.stats.increment('Category detection succeeded', 1)
     ret['mapped_category'] = self.mapCategory(ret['source'], ret['category'])
   else:
     self.stats.increment('Category detection failed', 1)
   return ret
Example #4
 def translate_path(self, path):
     """ Override to handle redirects.
     """
     path = path.split('?', 1)[0]
     path = path.split('#', 1)[0]
     path = normpath(unquote(path))
     words = filter(lambda a: a != '' and a not in (os.curdir, os.pardir),
                    path.split('/'))
     return os.path.join(self.serve_path, *words)
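The sanitisation steps are easy to trace with a hostile path; a standalone sketch of the same filtering (the serve path is made up):

import os
from os.path import normpath
from urllib.parse import unquote

def safe_join(serve_path, path):
    # same steps as translate_path above: strip query/fragment, decode,
    # normalise, then drop empty, '.' and '..' components
    path = path.split('?', 1)[0]
    path = path.split('#', 1)[0]
    path = normpath(unquote(path))
    words = [w for w in path.split('/') if w not in ('', os.curdir, os.pardir)]
    return os.path.join(serve_path, *words)

print(safe_join('/srv/www', '/../etc/passwd?x=1#frag'))
# -> /srv/www/etc/passwd on a POSIX system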
Example #5
def which(program, use_secure_path=False, options=None):
    """Searches the environment PATH (or an hard-coded 'secure' path) for an executable with the given name."""
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, name = os.path.split(program)
    if fpath:
        if is_exe(program):
            if options:
                program += " " + options
            return program
    else:
        if use_secure_path:
            path = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
        else:
            path = os.environ["PATH"]

        for part in path.split(os.pathsep):
            part = part.strip('"')
            exe_file = os.path.join(part, program)
            if is_exe(exe_file):
                if options:
                    exe_file += " " + options
                return exe_file

    return None
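A quick sanity check of which, assuming a POSIX-like system where ls is on the PATH:

if __name__ == '__main__':
    print(which('ls'))                  # e.g. /bin/ls, or None if not found
    print(which('ls', options='-la'))   # e.g. "/bin/ls -la"
    print(which('no-such-binary'))      # None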
Example #6
    def stidy(structure, ang, d1, d2, d3):
        PLATON = find_executable('platon')
        if not PLATON:
            PLATON = '../bin/platon'

        with NamedTemporaryFile(suffix='.cif') as temp_file:
            # write temporary cif file
            CifWriter(structure).write_file(temp_file.name)
            temp_file.flush()
            # run ADDSYM_SHX to make PLATON recognize symmetries
            addsym_shx_process = Popen([PLATON, '-o', temp_file.name],
                                       stdout=PIPE,
                                       stderr=STDOUT,
                                       stdin=PIPE)
            try:
                # note: communicate() only raises TimeoutExpired when a timeout= is given
                addsym_shx_process.communicate(
                    input='ADDSYM_SHX {} {} {} {}'.format(ang, d1, d2, d3).encode())
            except TimeoutExpired as t:
                return ExitCode(408, 'ADDSYM_SHX timed out: {}'.format(t))
            except Exception as e:
                return ExitCode(500, 'ADDSYM_SHX crashed: {}'.format(e))
            # call STIDY on the ADDSYM_SHX output
            temp_file_dirname, temp_file_basename = path.split(temp_file.name)
            temp_file_basename_extless, _ = path.splitext(temp_file_basename)
            temp_file_basename_spf = temp_file_basename_extless + '_pl.spf'
            temp_file_spf = path.join(temp_file_dirname,
                                      temp_file_basename_spf)

            if not os.path.isfile(temp_file_spf):
                return ExitCode(500,
                                'ADDSYM_SHX failed to write *_pl.spf file')

            stidy_process = Popen([PLATON, '-o', temp_file_spf],
                                  stdout=PIPE,
                                  stderr=STDOUT,
                                  stdin=PIPE)
            try:
                stidy_data = stidy_process.communicate(input=b'STIDY')
            except TimeoutExpired as t:
                return ExitCode(408, 'STIDY timed out: {}'.format(t))
            except Exception as e:
                return ExitCode(500, 'STIDY crashed: {}'.format(e))
        stidy_output = stidy_data[0].decode('utf-8')

        # clean up files
        if path.isfile('check.def'):
            remove('check.def')

        return stidy_output
Example #7
def process_inputs():
    in_dir = 'C:/Workspace/Bills/input'
    ext = '*-raw.txt'
    inputs = list()
    dirs = ['train', 'test']
    for d1 in dirs:
        files = glob(join(in_dir, d1, ext))
        for f in files:
            d = set([])
            if path.isfile(f):
                txtfile = open(f).readlines()
                raw = list()
                for line in txtfile:
                    emails = regex_email.findall(line)
                    if len(emails):
                        for email in emails:
                            raw.append(email)
                    else:
                        flag, txt = process_txt(line)
                        if flag:
                            if len(txt) > 2:
                                raw.append(txt)
                if len(raw) > 0:
                    for sentence in raw:
                        emails = regex_email.findall(sentence)
                        if len(emails):
                            words = emails
                        else:
                            words = nltk.word_tokenize(sentence)
                        d = d | set(words)
            sd = set(sorted(d))
            vocab = set([])
            porter = nltk.PorterStemmer()
            for stemming in sd:
                if not is_key_word(stemming):
                    stemmed_word = porter.stem(stemming)
                else:
                    stemmed_word = stemming
                vocab.add(stemmed_word)
            head, tail = path.split(f)
            if tail.rfind('.txt') == -1:
                continue  # skip files without the expected .txt suffix
            s = tail.replace('-raw.txt', '-input.txt')
            txtfilepath = path.join(head, s)
            txtf = open(txtfilepath, 'w+')
            for item in vocab:
                txtf.write(item)
                txtf.write('\n')
            txtf.close()
Example #8
def get_CParent_to_root_path_node_names(parse_dict,docID,sentID,conn_indices):
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        path = "NONE_TREE"
    else:
        path = ""
        for conn_index in conn_indices:
            conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
            conn_parent_node = conn_node.up
            path += syntax_tree.get_node_path_to_root(conn_parent_node) + "-->"
        if path[-3:] == "-->":
            path = path[:-3]
    return path.split("-->")
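The return contract is visible with plain strings; a minimal sketch of the same join/strip/split round trip (node names invented for illustration):

segments = ['VP-->S', 'NP-->S']
path = ''
for seg in segments:
    path += seg + '-->'
if path.endswith('-->'):
    path = path[:-3]
print(path.split('-->'))          # ['VP', 'S', 'NP', 'S']
print('NONE_TREE'.split('-->'))   # ['NONE_TREE'] -- the no-tree case yields a one-element list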
Example #9
def get_last_checkpoint(folder, modelasr_name):
    content = os.listdir(folder)
    # print(content)
    checkpoints = []

    for path in content:

        if path.split('.')[0].startswith(modelasr_name):
            checkpoints.append(path)

    if len(checkpoints) == 0:
        return
    _re_checkpoint = re.compile(r'(\d+)')
    max_checkpoint = max(
        checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0]))
    step_num = int(_re_checkpoint.search(max_checkpoint).groups()[0])

    return os.path.join(folder, max_checkpoint), step_num
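The checkpoint-picking logic can be exercised with fabricated file names, assuming the function above (and its os/re imports) is in scope:

import os
import tempfile

with tempfile.TemporaryDirectory() as d:
    for name in ('asr.10.pt', 'asr.250.pt', 'lm.999.pt'):
        open(os.path.join(d, name), 'w').close()
    print(get_last_checkpoint(d, 'asr'))   # ('<tmpdir>/asr.250.pt', 250)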
Example #10
def response_path(path):
    """    This method should return appropriate content and a mime type. 
    TOOD:
    get mime types aligned
    make PNG work
       """

    content = ''
    try:
        if path == '/':
            content = '\n'.join(os.listdir('.\\webroot'))
        else:
            file = open(Path.cwd().joinpath('webroot', path[1:]))
            content = file.read()
            file.close()
        # It seems odd to have one error raise another, but this follows the
        # TODO from line 116 of the instructions: "If response_path raised
        # a NameError, then let response be a not_found response."

        file_ext = path.split(".")
        file_type = ''.join(file_ext[-1:])

        # an if/elif chain is required here; with separate ifs the final else wins
        if file_type == 'html':
            mime_type = b"text/html"
        elif file_type == 'h':
            mime_type = b"text/html"
        elif file_type == 'txt':
            mime_type = b"text/plain"
        elif file_type == 'png':
            mime_type = b"image/png"
        elif file_type == "jpeg":
            mime_type = b"image/jpeg"
        else:
            mime_type = b"text/plain"
    except FileNotFoundError:
        raise NameError

    content = content.encode()

    return content, mime_type
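The hand-rolled extension table can also be delegated to the standard library; a minimal sketch using mimetypes.guess_type, with the same text/plain fallback as above:

import mimetypes

def guess_mime(path):
    # guess_type inspects the extension and returns (type, encoding)
    mime, _ = mimetypes.guess_type(path)
    return (mime or 'text/plain').encode()

print(guess_mime('/index.html'))   # b'text/html'
print(guess_mime('/logo.png'))     # b'image/png'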
Example #11
 def handleRequest(self, path, arguments, **kwargs):
     fileargs = dict(case=path.split('/')[-1])
     for k in ['verb', 'resumptionToken', 'metadataPrefix', 'set', 'from']:
         fileargs[k] = arguments[k][0] if k in arguments else ''
     if fileargs['resumptionToken']:
         filename = '{case}-{verb}-{resumptionToken}.xml'.format(**fileargs)
     else:
         filename = '{case}-{verb}-{metadataPrefix}-{set}-{from}.xml'.format(
             **fileargs)
     filepath = join(testdataDir, filename)
     if isfile(filepath):
         yield 'HTTP/1.0 200 Ok\r\nContent-Type: text/xml; charset=utf-8\r\n\r\n'
         with open(filepath) as fp:
             yield fp.read()
     else:
         yield 'HTTP/1.0 400 Bad Request\r\nContent-Type: text/plain; charset=utf-8\r\n\r\n'
         message = 'Error: file {} not found.'.format(repr(filepath))
         print(message)
         yield message
Example #12
 def process_vocab(self, vera_path):
     # Access all the *-vera.ai files under the given path and build the superset vocab
     vocab_path = join(vera_path, 'vocab.vera')
     vocab = set()
     for f in glob(path.join(vera_path, '*-vera.ai')):
         # each vera.ai file contributes its set of lines (words)
         lines = open(f).readlines()
         if len(lines) > 0:
             vocab = vocab | set(lines)
     if path.exists(vocab_path):
         # rename any existing vocab to *-backup before writing the new one
         head, tail = path.split(vocab_path)
         root, ext = path.splitext(tail)
         tail = root + '-backup' + ext
         if path.exists(path.join(head, tail)):
             os.unlink(path.join(head, tail))
         os.rename(vocab_path, path.join(head, tail))
     hf = open(vocab_path, 'w')
     for item in vocab:
         hf.write(item)
     hf.close()
Example #13
def process_vocab():
    in_dir = 'C:/Workspace/Bills/input'
    ext = '*-input.txt'
    d = set([])
    dirs = ['train', 'test']
    for d1 in dirs:
        vocab = set([])
        files = glob(join(in_dir, d1, ext))
        for f in files:
            d = set([])
            if path.isfile(f):
                txtfile = open(f).readlines()
                raw = list()
                for line in txtfile:
                    raw.append(line)
                if len(raw) > 0:
                    d = d | set(raw)
            vocab = vocab | d
        # derive the output directory directly instead of from the last file seen
        head = join(in_dir, d1)
        txtfilepath = join(head, 'Respicio-pp.txt')
        txtf = open(txtfilepath, 'w+')
        for item in vocab:
            txtf.write(item)
        txtf.close()
Example #14
def run():
    ext = '*.jpg'
    in_dir = 'C:/Workspace/Bills'
    ext_txt = 'txt'
    files = glob(join(in_dir, 'image', ext))
    txtdir = path.join(in_dir, 'input')
    for f in files:
        if path.isfile(f):
            img = Image.open(f)
            txt = pytesseract.image_to_string(img)
            head, tail = path.split(f)
            find_idx = tail.rfind('.jpg')
            new_tail = tail
            if find_idx != -1:
                new_tail = tail.replace('.jpg', '-raw.txt')
            txtfilepath = path.join(txtdir, new_tail)
            flag, raw = process_txt(txt)
            if flag:
                txtf = open(txtfilepath, 'w+')
                for i in raw:
                    txtf.write(i)
                    txtf.write('\n')
                txtf.close()
    err = 0
Example #15
async def Handle_Pcap(path):

    files_SendResquest = []
    task_SendRequest = []
    # extract http, ftp
    query1 = "tcpflow -r " + active_File + " -o " + path + " -e http"
    subprocess.check_output(query1, shell=True)

    # extract SMB
    list_SMB, list_Task = Export_SMB2()
    if (len(list_SMB) > 0):
        files_SendResquest.extend(list_SMB)
        task_SendRequest.extend(list_Task)
    entries = os.listdir(path)

    markFtp = []
    # drop files we cannot use
    for name in entries:
        fullPath = os.path.join(path, name)
        mime_Type = mime.from_file(fullPath)
        check = Check_UnFile(mime_Type)

        # remember plain-text flows captured on the FTP port
        if mime_Type == "text/plain":
            if name.find(portFtp) != -1:
                markFtp.append(name)

        elif mime_Type in deny_MimiType or check == 1:
            os.remove(fullPath)

    entries = os.listdir(path)

    # handle each remaining file
    for name in entries:
        no_FullPath = os.path.join(path, name)
        mime_Type = mime.from_file(no_FullPath)

        if mime_Type != 'text/plain':
            ValidIpAddressRegex = r'''(25[0-5]|2[0-4][0-9]|[0-1]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[0-1]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[0-1]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[0-1]?[0-9][0-9]?)'''
            # only file names that carry a source IP address can be parsed
            check = re.search(ValidIpAddressRegex, name)
            if check is None:
                continue
            fullPath, task = Parse_FileName(name, path, markFtp)
            if fullPath is None:
                continue
            if fullPath == '':
                os.remove(no_FullPath)
                continue

            if (mime_Type == 'application/zip'
                    or mime_Type == 'application/x-7z-compressed'
                    or mime_Type == 'application/x-rar-compressed'):
                try:
                    extract_Files, extract_Tasks = Extract_FileCompressed(
                        fullPath, task)
                    files_SendResquest.extend(extract_Files)
                    task_SendRequest.extend(extract_Tasks)
                except Exception:
                    print('could not extract compressed file: ' + fullPath)
            else:
                file = Static_Analyst(fullPath, task)
                #print(fullPath + "\n")

                if (file != ''):
                    task = task.obj_dict()
                    task_SendRequest.append(task)
                    files_SendResquest.append(file)

    history = path.split("/")[-1] + ".log"
    logging.basicConfig(level=logging.DEBUG,
                        filename=os.path.join('Log', history),
                        format='%(asctime)s %(levelname)s:%(message)s')
    logger = logging.getLogger(__name__)
    logger.info(" -------  \t File Send Dynamic  \t  ------  \n ")
    logger.info("Count: %d", len(files_SendResquest))

    print(" -------  \t File Send Dynamic  \t  ------  \n ")
    print("Count: %d", len(files_SendResquest))
    #print("Send Count: " + str(len(files_SendResquest)))
    for i in files_SendResquest:
        print(i)
        logger.info(i)

    logger.info(" -------  \t Finish \t  ------  \n ")
    print(" -------  \t Finish \t  ------  \n ")

    await Dynamic_Analyst(files_SendResquest, task_SendRequest, logger)
Example #16
    for i in range(ids.shape[1]):
        gcam.backward(ids[:, [i]])
        regions = gcam.generate(target_layer=layer)
        for j in range(len(images)):
            print(f"#{j}: {classes[ids[j, i]]} ({probs[j, i]:.5f})")
            # Grad-CAM
            raw_image = imread(paths[j])
            combined = combine_image_and_gcam(regions[j, 0], raw_image)
            processed_images[j].append(combined.astype(np.uint8))

    for j, (image_list, path) in enumerate(zip(processed_images, paths)):
        plt.figure(figsize=(16, 4))
        for i, image in enumerate(image_list):
            plt.subplot(1,
                        len(image_list),
                        i + 1,
                        xticks=[],
                        yticks=[],
                        frameon=False)

            c, p, t = classes[ids[j, i]], 100 * probs[j, i], bool(labels[j, i])
            plt.title(f"{c} {p:.0f}% ({t})", fontsize=10)
            plt.imshow(image)

        plt.tight_layout()

        filename = '-'.join(path.split('/')[-3:])
        filename = splitext(filename)[0] + '.png'
        plt.savefig(join(opt.output, filename))
        plt.clf()
Example #17
import time
from mpi4py import MPI
import h5py
import logging

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
cpus = comm.Get_size()

ts = time.perf_counter()

with open("/home/hklee/work/envs/envs.dat", "r") as f:
    contents = f.readlines()
for path in contents:
    # strip() drops the trailing newline that readlines() keeps on each value
    if "total_path" in path:
        total_path = path.split("=")[1].strip()
    elif "result" in path:
        result_path = path.split("=")[1].strip()
    elif "parameter" in path:
        para_path = path.split("=")[1].strip()
    elif "log" in path:
        log_path = path.split("=")[1].strip()

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logfile = log_path + '%d_log.dat' % rank

lf = logging.FileHandler(logfile, 'w')
form = logging.Formatter('%(asctime)s - %(message)s')
lf.setFormatter(form)
logger.addHandler(lf)
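The repeated key/elif pattern in this and the following examples can be collapsed into a single small parser; a minimal sketch, assuming envs.dat holds key=value lines (the exact key names are assumptions):

def read_envs(env_file):
    # parse key=value lines into a dict, skipping malformed lines
    envs = {}
    with open(env_file, "r") as f:
        for line in f:
            if "=" in line:
                key, _, value = line.partition("=")
                envs[key.strip()] = value.strip()
    return envs

envs = read_envs("/home/hklee/work/envs/envs.dat")
log_path = envs.get("log", "")   # e.g., if the key is literally "log"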
Example #18
from mpi4py import MPI
import warnings

# to stack the shear catalogs of each exposure into a file

warnings.filterwarnings("error")

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
cpus = comm.Get_size()

with open("/home/hkli/work/envs/envs.dat", "r") as f:
    contents = f.readlines()
for path in contents:
    # strip the trailing newline that readlines() keeps
    if "cfht_data_path" in path:
        data_path = path.split("=")[1].strip()
    elif "cfht_field_path" in path:
        field_path = path.split("=")[1].strip()

cfht_dict, fields = tool_box.field_dict(data_path + "nname.dat")
field_pool = tool_box.allot(fields, cpus)

for field in field_pool[rank]:
    expos = list(cfht_dict[field].keys())
    f_path = field_path + field + "/"
    for expo in expos:
        i = 0
        for chip in cfht_dict[field][expo]:
            dat_path = data_path + "%s/result/%s_shear.dat" % (field, chip)
            try:
                temp = numpy.loadtxt(dat_path, skiprows=1)
Example #19
cut = argv[1]

g1num = cpus - 6
g2num = cpus
g1 = numpy.linspace(-0.004, 0.004, g1num)
g2 = numpy.linspace(-0.0055, 0.0055, g2num)
dg1 = g1[1] - g1[0]
dg2 = g2[1] - g2[0]

t1 = time.perf_counter()
with open("%s/work/envs/envs.dat" % my_home, "r") as f:
    contents = f.readlines()
for path in contents:
    if "cfht_data_path" in path:
        data_path = path.split("=")[1].strip()
    elif "cfht_res_path" in path:
        result_path = path.split("=")[1].strip()
    elif "cfht_pic_path" in path:
        pic_path = path.split("=")[1].strip()
    elif "cfht_cut_path" in path:
        cut_path = path.split("=")[1].strip()

fq = Fourier_Quad(48, 123)

g1_data_path = result_path + "g1_%d.npz" % rank
g2_data_path = result_path + "g2_%d.npz" % rank

g1num = cpus - 6
g2num = cpus
g1 = numpy.linspace(-0.004, 0.004, g1num)
Example #20
# to find the binary on the each source chip. it will save the binary label for each source.
# '1' means binary
# if the command input is 'find', it will find the binaries
# if 'stack' is input, it will stack the existing binary label files.

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
cpus = comm.Get_size()

ts = time.perf_counter()

with open("%s/work/envs/envs.dat" % my_home, "r") as f:
    contents = f.readlines()
for path in contents:
    if "cfht_data_path" in path:
        data_path = path.split("=")[1].strip()
    elif "cfht_res_path" in path:
        result_path = path.split("=")[1].strip()
    elif "cfht_pic_path" in path:
        pic_path = path.split("=")[1].strip()
    elif "cfht_field_path" in path:
        field_path = path.split("=")[1].strip()

size = 48
fq = Fourier_Quad(size, 123)

nname_path = data_path + "nname.dat"
field_dict, fields = tool_box.field_dict(nname_path)
r_fields = tool_box.allot(fields, cpus)[rank]

# for the stacking process
Example #21
import numpy
import time
import os
import matplotlib.pyplot as plt
from sys import argv
import tool_box
import copy
# Fourier_Quad is provided by one of the author's local modules (import not shown)

data_name = argv[1]
g1num, g2num, bin_num = int(argv[2]), int(argv[3]), int(argv[4])
thresh = float(argv[5])

with open("/home/hkli/work/envs/envs.dat", "r") as f:
    contents = f.readlines()
for path in contents:
    if "cfht_data_path" in path:
        total_path = path.split("=")[1].strip()
    elif "cfht_res_path" in path:
        result_path = path.split("=")[1].strip()

field_path = result_path + "field/"

nname_path = total_path + "nname.dat"
all_fields = tool_box.field_dict(nname_path)[1]

filter_path = result_path + "field/filtered.dat"
filter_exist = os.path.exists(filter_path)

fq = Fourier_Quad(48, 123)
cache_path = result_path + data_name
print(cache_path)
arr = numpy.load(cache_path)["arr_1"]