Example #1
def recoverUnknown(f_unknown_p, f_unknown, f_align_p, f_clean):
    """ Shows us the Foreign sentence that produced no formality while the English sentence had a "you". """
    print "Recovering unknown sentences"

    unknown = loadStruct(f_unknown_p)
    align = loadStruct(f_align_p)

    with copen(f_unknown, "w", encoding="utf-8") as unknown_f:
        for doc, proj in unknown.iteritems():
            if len(proj) > 0:
                de = []
                links = align[doc]

                for p in proj:
                    for link in links:
                        if p in link[1].split(" "):
                            de.extend(link[0].split(" "))

                with copen(f_clean + doc[0].replace(".gz", "")) as doc_f:
                    dom = parse(doc_f)
                    nodes = dom.getElementsByTagName("s")

                    for node in nodes:
                        if node.getAttribute("id") in de:
                            unknown_f.write("%s\n" % node.firstChild.nodeValue)
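Note: throughout these examples copen appears to be codecs.open imported under an alias, and project helpers such as loadStruct or createPath are not shown. A minimal sketch of that assumed setup, with an illustrative file name:

from codecs import open as copen

with copen("example.txt", "w", encoding="utf-8") as out_f:
    out_f.write(u"voil\u00e0\n")   # unicode text is encoded to UTF-8 on write

with copen("example.txt", encoding="utf-8") as in_f:
    text = in_f.read()             # and decoded back to unicode on read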
Example #3
    def init(self):
        """
        Internal, make the dirs and touch the files
        """
        pexists = os.path.exists
        pjoin = os.path.join
        #if not pexists(".htaccess"):
        #    with copen(".htaccess", "w") as f:
        #        f.write("DirectoryIndex %s" % argv[0])
        for dir in [d for o, d in self.config.items("blog") if
                o.endswith("directory")]:
            if not pexists(dir):
                os.makedirs(dir, mode=(0770 if not "captcha" in dir else 0771))
                with copen(pjoin(dir, ".htaccess"), "w") as f:
                    f.write("Order Deny, Allow\nDeny from All")

        for index in [getattr(self, a) for a in self.__dict__ if
                a.endswith("_index")]:
            if not pexists(index):
                with copen(index, "w"): pass
                os.chmod(index, 0640)
        if pexists(self.init_file):
            with copen(self.init_file) as f:
                lines = f.readlines()
            result = []
            try:
                result = [self.add_user(*l.strip("\n").split(":")) for l in
                          lines]
            except TypeError:
                pass
            if result:
                with copen("%s.log" % self.init_file, "w") as f:
                    f.writelines(result)
                os.remove(self.init_file)
Example #4
def processGutenberg(f_gutenberg, f_gproj):
    """
    Processing the Project Gutenberg corpus.
    """
    for f_g in ["test/", "train/"]:
        createPath(f_gproj + f_g)

        for f_novel in listdir(f_gutenberg + f_g):
            if f_novel.endswith("_en.txt"):

                with copen(f_gproj + f_g + f_novel, "w",
                           encoding="utf-8") as gproj_f:
                    gproj_f.write("<d src=\"%s\">\n" % f_novel)

                    with copen(f_gutenberg + f_g + f_novel,
                               encoding="utf-8") as novel_f:
                        j = 2

                        for i, line in enumerate(novel_f.readlines()):
                            if i in xrange(j - 2, j):
                                line = line.strip()

                                if line.startswith("<S"):
                                    m = match(
                                        ".*sentNum:([0-9]+).*F:([0|1]) I:([0|1])",
                                        line)
                                    gproj_f.write(
                                        "<s id=\"%s\" f=\"%s\" i=\"%s\">" %
                                        (m.group(1), m.group(2), m.group(3)))
                                else:
                                    gproj_f.write("%s</s>\n" % line)
                            elif i == j:
                                j += 4
                    gproj_f.write("</d>\n")
Example #5
def line_iter(path1, path2):
    file1 = copen(path1, encoding='utf-8')
    file2 = copen(path2, encoding='utf-8')
    for p_line in izip(file1, file2):
        yield p_line
    file1.close()
    file2.close()
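Because line_iter only closes the two files once the pairing is exhausted, a caller that stops iterating early leaves both handles open. A hedged variant using with-blocks (izip as in the original; on Python 3 the built-in zip plays the same role):

def line_iter_closing(path1, path2):
    # closes both files even if the consumer stops iterating early
    with copen(path1, encoding='utf-8') as file1:
        with copen(path2, encoding='utf-8') as file2:
            for p_line in izip(file1, file2):
                yield p_line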
Example #6
def processGutenberg(f_gutenberg, f_gproj):
    """
    Processing the Project Gutenberg corpus.
    """
    for f_g in ["test/", "train/"]:
        createPath(f_gproj + f_g)

        for f_novel in listdir(f_gutenberg + f_g):
            if f_novel.endswith("_en.txt"):

                with copen(f_gproj + f_g + f_novel, "w", encoding="utf-8") as gproj_f:
                    gproj_f.write("<d src=\"%s\">\n" % f_novel)

                    with copen(f_gutenberg + f_g + f_novel, encoding="utf-8") as novel_f:
                        j = 2

                        for i, line in enumerate(novel_f.readlines()):
                            if i in xrange(j - 2, j):
                                line = line.strip()

                                if line.startswith("<S"):
                                    m = match(".*sentNum:([0-9]+).*F:([0|1]) I:([0|1])", line)
                                    gproj_f.write("<s id=\"%s\" f=\"%s\" i=\"%s\">" % (m.group(1), m.group(2), m.group(3)))
                                else:
                                    gproj_f.write("%s</s>\n" % line)
                            elif i == j:
                                j += 4
                    gproj_f.write("</d>\n")
Example #7
    def append(self, ical, filename=None):
        """Append a Remind command generated from the iCalendar to the file"""
        if not filename:
            filename = self._filename
        elif filename not in self._icals:
            return

        with self._lock:
            outdat = self.to_reminders(readOne(ical))
            copen(filename, 'a', encoding='utf-8').write(outdat.decode('utf-8'))
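The append above writes through a temporary handle that is only closed when it is garbage collected. A small standalone sketch of the same kind of append done through a with-block (assumes the copen alias for codecs.open):

def append_line(path, text):
    # append already-decoded text and close the handle deterministically
    with copen(path, 'a', encoding='utf-8') as out_f:
        out_f.write(text)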
Example #8
def convert_file(file_path):
    print("[*]", file_path, "fixed!")
    foriginal = copen(file_path, "r", "utf8")
    content = foriginal.read()
    foriginal.close()

    ccontent = fix_encoding(content, ENCODING, NORMALIZE, True)
    fconverted = copen(file_path, "w", "utf8")
    fconverted.write(ccontent)
    fconverted.close()
Example #9
def replace(source_file_path, pattern, substring, is_regexp):
    _, target_file_path = mkstemp()

    with copen(target_file_path, 'w', 'utf-8') as target_file:
        with copen(source_file_path, 'r', 'utf-8') as source_file:
            for line in source_file:
                if is_regexp:
                  target_file.write(sub(pattern, substring, line))
                else:
                  target_file.write(line.replace(pattern, substring))
    remove(source_file_path)
    move(target_file_path, source_file_path)
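mkstemp() also returns an open OS-level file descriptor, which the example above never closes. A sketch of the same replace with that descriptor released (the imports are assumptions about what the original pulls in):

from codecs import open as copen
from os import close, remove
from re import sub
from shutil import move
from tempfile import mkstemp

def replace(source_file_path, pattern, substring, is_regexp):
    fd, target_file_path = mkstemp()
    close(fd)  # release the raw descriptor; the path is reopened through codecs below

    with copen(target_file_path, 'w', 'utf-8') as target_file:
        with copen(source_file_path, 'r', 'utf-8') as source_file:
            for line in source_file:
                if is_regexp:
                    target_file.write(sub(pattern, substring, line))
                else:
                    target_file.write(line.replace(pattern, substring))
    remove(source_file_path)
    move(target_file_path, source_file_path)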
Example #10
def convert_file(file_path):
    foriginal = copen(file_path, "r", "utf8")
    content = foriginal.read()
    foriginal.close()

    for codec in codecs:
        print("[*] codec:", codec)

        fconverted = copen(file_path.replace(".", "_%s." % codec), "w", "utf8")
        fconverted.write(
            content.encode(codec, "ignore").decode("utf8", "ignore"))
        fconverted.close()
Example #11
def convert_file(srcFile, dstFile, delim = DELIM,
                 src_codec=SOURCE_CODEC, dst_codec=DEST_CODEC):
    '''Convert a CSV file to standard format'''
    # From http://stackoverflow.com/a/191403
    with copen(srcFile, "r", SOURCE_CODEC) as sourceFile:
        with copen(dstFile, "w", DEST_CODEC) as targetFile:
            while True:
                line = sourceFile.readline()
                line = ",".join(line.split(delim))
                if not line:
                    break
                targetFile.write(line)
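The plain split/join above also splits delimiters that appear inside quoted fields. A Python 3 sketch using the csv module instead (the delimiter and codecs stand in for the original's DELIM, SOURCE_CODEC and DEST_CODEC constants):

import csv

def convert_file_csv(srcFile, dstFile, delim=';',
                     src_codec='utf-8', dst_codec='utf-8'):
    # csv handles quoting, so a delimiter inside a quoted field is not split
    with open(srcFile, 'r', encoding=src_codec, newline='') as sourceFile, \
         open(dstFile, 'w', encoding=dst_codec, newline='') as targetFile:
        reader = csv.reader(sourceFile, delimiter=delim)
        writer = csv.writer(targetFile)
        for row in reader:
            writer.writerow(row)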
Example #12
def gettimes(fname, nframes=None):
    if nframes is None:
        try:
            with pytiff.Tiff(fname) as handle:
                tags = handle.read_tags()
                nframes = int(tags['image_description'].split()[2][7:])
        except:
            print("I could not get the number of frames, please provide it")
            return

    with copen(fname, "r", "windows-1252") as f:
        j = 0
        times = zeros((nframes))
        while True:
            try:
                line = f.readline()
                # print(line)
                linesp = line.replace('\x00', '').strip().split()
                if len(linesp) == 5:
                    if linesp[2] == 'Time_From_Last':
                        k, t = linesp[1], linesp[-1]
                        # print(int(k),float(t))
                        times[int(k) - 1] = float(t)
                        j = j + 1
            except:
                break
            if j >= nframes:
                break
    return (times)
Example #13
def prepare_articles(names):
    '''saves tagged articles about given entities in a cache'''
    for f in glob.glob(join(raw_articles_path, "*.txt*")):
        os.remove(f)
    found = False
    link_dictionaries = {}
    for i, name in enumerate(names):
        try:
            get_article(name)
        except ArticleNotFoundError:
            try:
                article, link_dictionary = get_raw_article(name)
                link_dictionaries[i] = link_dictionary
            except ArticleNotFoundError:
                continue
            found = True
            article = '\n'.join(article.split('\n')[:article_sentence_limit])
            out = copen(join(raw_articles_path, '%d.txt' % i), 'w', 'utf-8')
            print >> out, article
    if found:
        articles = lt.run_nlptools(link_dictionaries)
        for f in glob.glob(join(raw_articles_path, "*.txt*")):
            os.remove(f)
        #save processed articles
        for i, article in articles.iteritems():
            Pickler.store(article, articles_cache_path % names[i])
Example #14
    def remove(self, uid, filename=None):
        """Remove the Remind command with the uid from the file"""
        if not filename:
            filename = self._filename
        elif filename not in self._icals:
            return

        uid = uid.split('@')[0]

        with self._lock:
            rem = copen(filename, encoding='utf-8').readlines()
            for (index, line) in enumerate(rem):
                if uid == md5(line[:-1].encode('utf-8')).hexdigest():
                    del rem[index]
                    copen(filename, 'w', encoding='utf-8').writelines(rem)
                    break
Example #15
 def _init_data_list(self):
     self.data_list = []
     with copen(self.jsons_file, mode='r', encoding='latin1') as f:
         for line in f:
             new_relation = Relation.from_json(
                 json.loads(line, object_pairs_hook=OrderedDict))
             self.data_list.append(new_relation)
Example #16
    def replace(self, uid, ical, filename=None):
        """Update the Remind command with the uid in the file with the new iCalendar"""
        if not filename:
            filename = self._filename
        elif filename not in self._icals:
            return

        uid = uid.split('@')[0]

        with self._lock:
            rem = copen(filename, encoding='utf-8').readlines()
            for (index, line) in enumerate(rem):
                if uid == md5(line[:-1].encode('utf-8')).hexdigest():
                    rem[index] = self.to_reminders(readOne(ical))
                    copen(filename, 'w', encoding='utf-8').writelines(rem)
                    break
Example #17
 def _get_comment_content(self, comm_id):
     try:
         with copen(self._path_to_comment(comm_id), "r", "utf-8") as f:
             comm = f.read()
         return comm
     except IOError:
         return None
Example #18
 def _get_post_content(self, post_id):
     try:
         with copen(self._path_to_post(post_id), "r", "utf-8") as f:
             post = f.read()
         return post
     except IOError:
         return None
Example #19
    def draw_title(path, title="", x=400, y=410, font_size=20, colour="black"):
        """Draws title into svg.
    
        :param path: path to svg
        :param title: text do input into picture
        :param x: starting x position of the title
        :param y: starting y position of the title
        :param font_size: font size of labels -- default is 20
        :param colour: title colour
        """

        with copen(path, "r+", "utf-8") as svg:
            svg.seek(-6, 2)
            svg.write(
                '\n<text x ="'
                + str(x)
                + '" y="'
                + str(y)
                + '" stroke="none" font-size="'
                + str(font_size)
                + '" fill="'
                + colour
                + '" font-family="sans-serif">'
                + title
                + "</text>\n</svg>"
            )
Example #20
 def load_file(self, file=-1, changing=False, label="", border=""):
     if file == -1:
         file = self.file
     try:
         with copen(file, "r", encoding="utf-8") as f:
             content = loads(f.read())
     except Exception as e:
         if changing:
             label.config(fg=wc.WRONG)
             sound("source\\wrong.wav")
             if border != "":
                 border.config(bg=wc.WRONG)
                 self.root.update()
                 sleep(0.2)
                 border.config(bg="black")
             label["text"] = "The file {} wasn't found.".format(file)
             #print("Works")
         return -1
     else:
         load_colors(content["Colors"])
         self.content = content
         if type(self.content) != type(1):
             self.wordlist = list(self.content["Language"])
         if changing:
             if border != "":
                 border.config(bg=wc.GOOD)
                 self.root.update()
                 sound("source\\correct.wav")
                 sleep(0.2)
                 border.config(bg="black")
             label.config(fg="black")
             label["text"] = "File {} loaded correctly.".format(file)
             self.newWord()
Example #21
def save_corpora(corpora_iter, path1, path2):
    """TODO: Docstring for save_corpora.

    :corpora_iter: TODO
    :path1: TODO
    :path2: TODO
    :returns: TODO

    """
    with copen(path1, 'w', encoding='utf-8') as f1,\
            copen(path2, 'w', encoding='utf-8') as f2:
        for sent1, sent2 in corpora_iter:
            sent1 = sent1.replace('\n', ' ').strip() + '\n'
            sent2 = sent2.replace('\n', ' ').strip() + '\n'
            f1.write(sent1)
            f2.write(sent2)
Example #22
    def analyze(self,data,_data,filename):
        listheaders = []
        listpayloads = []

        for _ in data:
            listheaders.append(str( _["fields"]))
            listpayloads.append(str( _["payload"]))

        headers = "".join(listheaders)
        content = "".join(listpayloads)

        with copen(self.intell+filename,"r",encoding='utf8') as f:
            for _ in loads(f.read()):
                x = None  # reset so a match from a previous rule cannot leak into this one
                try:
                    if "Type" in _ and "WQREGEX" in _["Type"]:
                        if _["Options"]["Word"] == "Normal" and "Header_Detection" in _:
                            x = search(compile(r"{}".format(_["Header_Detection"]),_["Options"]["Flag"]),headers)
                        elif _["Options"]["Word"] == "Normal" and "Content_Detection" in _:
                            x = search(compile(r"{}".format(_["Content_Detection"]),_["Options"]["Flag"]),content)
                        if x is not None:
                            _data.append({"Matched":"1","Required":_["Options"]["Required"],"WAF":_["Name"],"Detected":x.group()})
                except:
                    pass

        self.check_proxy_bypass(data,_data)
Example #24
def find_nearest_category_text():
    """Finds closest article to process for each IAB sub category."""

    #grab IAB subcats
    # - must be lower case
    # - must have spaces replaced by underscores
    iab_sub_cats = []
    cats = create_category_dictionary(iab)
    for k, v in cats.iteritems():
        for x in v:
            iab_sub_cats.append(x.lower().replace(" ", "_"))
    iab_sub_cats = set(iab_sub_cats)

    #process titles in file

    nearest_titles = defaultdict(list)

    with copen("id_to_page.tsv", encoding='utf8') as f:
        for n, line in enumerate(f):
            if line != "":
                try:
                    title = line[:-1].split('\t')[1]
                    comparison_title = title.lower()
                    if comparison_title in iab_sub_cats:
                        nearest_titles[comparison_title].append(title)

                except Exception, e:
                    print Exception, e

            if n % 1000000 == 0:
                print "Done {0} found {1} of {2}".format(
                    n, len(nearest_titles), len(iab_sub_cats))
Example #25
 def rcompile_and_find(self, data, filename):
     '''
     parse the detections and check them against wordsstripped
     '''
     with copen(filename, "r", encoding='utf8') as file:
         for _ in loads(file.read()):
             with ignore_excpetion(Exception):
                 if "Type" in _ and "QREGEX" in _["Type"]:
                     _list = []
                     tempmatches = 0
                     for item in _["Detection"]:
                         if _["Options"]["Word"] == "Normal":
                             temp_value = rsearch(
                                 rcompile(r"{}".format(item),
                                          _["Options"]["Flag"]),
                                 self.wordsstripped)
                         elif _["Options"]["Word"] != "Normal":
                             temp_value = rsearch(
                                 rcompile(r"\b{}\b".format(item),
                                          _["Options"]["Flag"]),
                                 self.wordsstripped)
                         if temp_value is not None:
                             _list.append(temp_value.group())
                             tempmatches += 1
                     if _list and tempmatches >= _["Options"]["Required"]:
                         data.append({
                             "Matched": tempmatches,
                             "Required": _["Options"]["Required"],
                             "Behavior": _["Name"],
                             "Detected": ', '.join(_list)
                         })
Example #26
 def test_windows1252(self):
     vtt_string = copen(self.windows_path, encoding='windows-1252').read()
     vtt_file = from_string(vtt_string, encoding='windows-1252', eol='\r\n')
     self.assertEqual(len(vtt_file), 1332)
     self.assertEqual(vtt_file.eol, '\r\n')
     self.assertRaises(UnicodeDecodeError, vttopen,
                       self.utf8_path, encoding='ascii')
Example #27
def extract_rows(auto_parsed_file_name):
	wsj_id_re = re.compile('wsj_(\d\d\d\d)')
	wsj_id = wsj_id_re.search(auto_parsed_file_name).group(1)
	corenlp_result = json.loads(copen(auto_parsed_file_name, mode='r').read())
	sentences = corenlp_result['sentences']
	rows = []
	for i, sentence in enumerate(sentences):
		split_clauses_from_sentence(sentence)
		sentence_start = sentence['words'][0][1]['CharacterOffsetBegin']
		sentence_end = sentence['words'][-1][1]['CharacterOffsetEnd']
		parse_json_string = json.dumps(sentence, cls=TJsonEncoder)
		coref_json_string = json.dumps([])
		if 'coref' in corenlp_result:
			coref_json_string = json.dumps(corenlp_result['coref'])
		row = {
			'wsj_section': int(wsj_id[0:2]),
			'wsj_id': wsj_id,
			'sentence_id': i,
			'parse_json': parse_json_string,
			'sentence_start': sentence_start,
			'sentence_end': sentence_end,
			'sentence_text': sentence['text'],
			'coreference_json': coref_json_string
		}
		rows.append(row)
	return rows
Example #28
def write_training_file(file_name, new_file_name, counter, cutoff):
    """Write the file such that the features are pruned based on the cutoff

    It slows down a bit because we have to re-read the file instead of using
    whatever is already in memory.
    """
    with copen(file_name, encoding='utf8') as f:
        lines = f.readlines()
    new_training_file = copen(new_file_name, 'w', encoding='utf8')
    for line in lines:
        name, label, features = line.strip().split('\t')
        features = [x for x in features.split(' ') if counter[x] > cutoff]
        if len(features) == 0: 
            features = ['NO_FEATURE']
        new_training_file.write('%s\t%s\t%s\n' % (name, label, ' '.join(features)))
    new_training_file.close()
Example #29
def get_synonims():
    """
        Returns a list of pairs (word, list_of_synonyms).
    """
    print('Started reading synonims file...')
    SYN_NAME = 'engine/synonims.txt'

    data_reader = copen(SYN_NAME, 'r', 'windows-1251')
    raw_data = data_reader.read()
    data_reader.close()

    data = raw_data.split('\r\n')
    data = list(map(lambda x: tuple(x.split('|')), data))
    # interested in synonims to one word precisely:
    data = list(filter(lambda x: '?' not in x[0] and ' ' not in x[0], data)) 

    ret = []
    for line in data:
        if len(line) < 2:
            continue
        word = InverseIndex._clean(line[0])
        syno = line[1]
        syno = syno.split(',')
        # don't bother with multiple choices, like 'you (shall, will)'
        syno = list(filter(lambda x: ')' not in x and '(' not in x, syno))
        syno = list(map(lambda x: InverseIndex._clean(x), syno))
        
        ret.append((word, syno))
        
    print('Finished reading synonims file!')
    return ret
Example #30
def find_nearest_category_text():
	"""Finds closest article to process for each IAB sub category."""
	
	#grab IAB subcats
	# - must be lower case
	# - must have spaces replaced by underscores
	iab_sub_cats = []
	cats = create_category_dictionary(iab)
	for k,v in cats.iteritems():
		for x in v:
			iab_sub_cats.append(x.lower().replace(" ", "_"))
	iab_sub_cats = set(iab_sub_cats)
	
	#process titles in file
	
	nearest_titles = defaultdict(list)
	
	with copen("id_to_page.tsv", encoding='utf8') as f:
		for n, line in enumerate(f):
			if line != "":
				try:
					title = line[:-1].split('\t')[1]
					comparison_title = title.lower()
					if comparison_title in iab_sub_cats:
						nearest_titles[comparison_title].append(title)
					
				except Exception, e:
					print Exception, e
			
			if n % 1000000 == 0:
				print "Done {0} found {1} of {2}".format(n, len(nearest_titles), len(iab_sub_cats))
Example #31
def clean_answer(answer, prologue, code):
    ret = []
    outp = copen('outp', code, encoding='utf-8')
    outp.write(prologue + '\n')
    for feature in answer:
        outp.write(feature[0] + ' ' + feature[1] + '\n')
    return ret
Example #32
 def _add_to_file(self, filename, line, join=False):
     varname = make_varname(filename)
     if hasattr(self, varname):
         setattr(self, varname, getattr(self, varname) +
         [self.fields_separator.join(line),])
     with copen(filename, "a", "utf-8") as f:
         f.write((self.fields_separator.join(line) if join else line) + "\n")
Example #33
def prepare_articles(names):
    '''saves tagged articles about given entities in a cache'''
    for f in glob.glob(join(raw_articles_path, "*.txt*")):
        os.remove(f)
    found = False
    link_dictionaries = {}
    for i, name in enumerate(names):
        try:
            get_article(name)
        except ArticleNotFoundError:
            try:
                article, link_dictionary = get_raw_article(name)
                link_dictionaries[i] = link_dictionary
            except ArticleNotFoundError:
                continue
            found = True
            article = '\n'.join(article.split('\n')[: article_sentence_limit])
            out = copen(join(raw_articles_path, '%d.txt' % i), 'w', 'utf-8')
            print >>out, article
    if found:
        articles = lt.run_nlptools(link_dictionaries)
        for f in glob.glob(join(raw_articles_path, "*.txt*")):
            os.remove(f)
        #save processed articles
        for i, article in articles.iteritems():
            Pickler.store(article, articles_cache_path % names[i])
Example #34
def gettimes(fname, nframes=None):
    if nframes is None:
        try:
            # info[270]: TIFF tag 270 (ImageDescription) holds the frame-count text
            info = readtifInfo(fname, verbose=False)
            nframes = int([d for d in info[270].split('\n') \
                         if d.find('frames')>=0][0].split('=')[-1])

            # ~ with pytiff.Tiff(fname) as handle:
            # ~ tags = handle.read_tags()
            # ~ nframes = int(tags['image_description'].split()[2][7:])
        except:
            print("I could not get the number of frames, please provide it")
            return

    with copen(fname, "r", "windows-1252") as f:
        j = 0
        times = zeros((nframes))
        while True:
            try:
                # ~ print('linea ',j)
                line = f.readline()
                # ~ print(line)
                linesp = line.replace('\x00', '').strip().split()
                if len(linesp) == 5:
                    if linesp[2] == 'Time_From_Last':
                        k, t = linesp[1], linesp[-1]
                        # print(int(k),float(t))
                        times[int(k) - 1] = float(t)
                        j = j + 1
            except Exception as e:
                print(e)
                break
            if j >= nframes:
                break
    return (times)
Example #35
def crawl_ted(langs, output, ignore_urls=()):
    crawl_id = str(int(time.time()))
    ignore = [u.split('/')[-1] for u in ignore_urls]
    for lang in langs:
        log.info('Language %s', lang)
        path = os.path.join(output, crawl_id + '_' + lang)
        log.info('Saving into file %s', path)
        with copen(path, 'w', encoding='utf-8') as f:
            for url in get_pages():
                log.info('Page %s', url)
                html = get_html_retry(url)
                if html is None:
                    break
                try:
                    for talk_url in get_processed(html, get_talks):
                        if talk_url.split('/')[-1] in ignore:
                            log.info('Ignoring %s', talk_url)
                            continue
                        talk_url += '/transcript?language=' + lang
                        log.info('Talk %s', talk_url)
                        talk_html = get_html_retry(talk_url)
                        if talk_html is None:
                            continue
                        try:
                            lines = get_processed(talk_html, get_transcript)
                        except ParseError:
                            pass
                        else:
                            f.write('\n'.join(lines))
                except ParseError:
                    break
    log.info('Done')
Example #36
def main():
    for file in listdir():
        if file[-5:].lower() == ".json":
            with copen(file, "r", encoding="utf-8") as f:
                content = loads(f.read())
            print("Configuring {}...".format(file))
            if "Colors" in content:
                content["Colors"]["WIDTH"] = 700
                content["Colors"]["HEIGHT"] = 400
                content["Colors"]["MAIN_COLOR"] = "#00FF80"
                content["Colors"]["SECONDARY_COLOR"] = "#ECF0F1"
                content["Colors"]["WRONG"] = "#DF013A"
                content["Colors"]["GOOD"] = "#00FF80"
                content["Colors"]["ADD_WORD"] = "F7FE2E"
            with copen(file, "w", encoding="utf-8") as f:
                f.write(dumps(content, indent=4))
Example #37
def get_long_description():
    """ Retrieve the long description from DESCRIPTION.rst """
    here = os.path.abspath(os.path.dirname(__file__))

    with copen(os.path.join(here, 'README.rst'),
               encoding='utf-8') as description:
        return description.read()
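On Python 3 the built-in open() accepts an encoding argument, so the same helper works without codecs; a sketch (assumes os is imported as in the original):

def get_long_description():
    """ Retrieve the long description from README.rst (built-in open) """
    here = os.path.abspath(os.path.dirname(__file__))

    with open(os.path.join(here, 'README.rst'), encoding='utf-8') as description:
        return description.read()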
Example #38
def runScript(command=None, tempfile=None):
    timingfname = None
    scriptfname = None
    CMD = ['script']

    if tempfile:
        timingfname = "%s.timing" % str(tempfile)
        scriptfname = "%s.log" % str(tempfile)
        with open(timingfname, 'w'):
            with open(scriptfname, 'w'):
                pass
    else:
        with NamedTemporaryFile(delete=False) as timingf:
            with NamedTemporaryFile(delete=False) as scriptf:
                timingfname = timingf.name
                scriptfname = scriptf.name

    CMD.append('-t')

    if command:
        CMD.append('-c')
        CMD.append(command)

    CMD.append(scriptfname)

    with open(timingfname, 'w') as timingf:
        proc = Popen(CMD, stderr=timingf)
        proc.wait()

    return copen(scriptfname, encoding='utf-8', errors='replace'), \
           open(timingfname, 'r')
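runScript reads the typescript back with errors='replace'. A tiny illustration of the error-handling modes that codecs.open (and the built-in open) accept for that argument:

from codecs import decode

raw = b'caf\xff'                      # 0xff is not valid UTF-8
decode(raw, 'utf-8', 'replace')       # u'caf\ufffd' - bad bytes become U+FFFD
decode(raw, 'utf-8', 'ignore')        # u'caf'       - bad bytes are dropped
# decode(raw, 'utf-8', 'strict') would raise UnicodeDecodeError (the default mode)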
Example #39
 def compile_and_find(self, data, filename):
     '''
     parse the detections and check them against wordsstripped
     '''
     with copen(filename, "r", encoding='utf8') as f:
         for _ in loads(f.read()):
             try:
                 if "Type" in _ and "QREGEX" in _["Type"]:
                     _list = []
                     tempmatches = 0
                     for item in _["Detection"]:
                         if _["Options"]["Word"] == "Normal":
                             x = search(
                                 compile(r"{}".format(item),
                                         _["Options"]["Flag"]),
                                 self.wordsstripped)
                         elif _["Options"]["Word"] != "Normal":
                             # functions ending in A/W do not match when using the "Word" option
                             x = search(
                                 compile(r"\b{}\b".format(item),
                                         _["Options"]["Flag"]),
                                 self.wordsstripped)
                         if x is not None:
                             _list.append(x.group())
                             tempmatches += 1
                     if _list and tempmatches >= _["Options"]["Required"]:
                         data.append({
                             "Matched": tempmatches,
                             "Required": _["Options"]["Required"],
                             "Behavior": _["Name"],
                             "Detected": ','.join(_list)
                         })
             except:
                 pass
Example #40
 def get_file_content(self, path ):
     # aa = open(path, "rb")
     aa = copen(path, "rb",encoding="U8")
     cont = aa.read()
     # cont = cont.replace("&nbsp;"," ")
     cont = cont.replace("&","DHTN__")
     return cont
Example #41
    def process_chapters(self, db_book, book_id, book_link):
        """
        Extract the chapters, and do some initial processing of the verses

        :param book: An OpenLP bible database book object
        :param chapters: parsed chapters
        :return: None
        """
        log.debug(book_link)
        book_file = os.path.join(self.base_dir, os.path.normpath(book_link))
        with copen(book_file, encoding='utf-8', errors='ignore') as f:
            page = f.read()
        soup = BeautifulSoup(page, 'lxml')
        header_div = soup.find('div', 'textHeader')
        chapters_p = header_div.find('p')
        if not chapters_p:
            chapters_p = soup.p
        log.debug(chapters_p)
        for item in chapters_p.contents:
            if self.stop_import_flag:
                break
            if isinstance(item, Tag) and item.name in ['a', 'span']:
                chapter_number = int(item.string.strip())
                self.set_current_chapter(db_book.name, chapter_number)
                self.process_verses(db_book, book_id, chapter_number)
Example #42
    def process_books(self):
        """
        Extract and create the bible books from the parsed html

        :param bible_data: parsed xml
        :return: None
        """
        with copen(os.path.join(self.base_dir, 'index.htm'), encoding='utf-8', errors='ignore') as index_file:
            page = index_file.read()
        soup = BeautifulSoup(page, 'lxml')
        bible_books = soup.find('div', 'textOptions').find_all('li')
        book_count = len(bible_books)
        for li_book in bible_books:
            log.debug(li_book)
            if self.stop_import_flag:
                break
            # Sometimes the structure is "[1] <a>Genesis</a>", and sometimes it's "<a>[1] Genesis</a>"
            if isinstance(li_book.contents[0], NavigableString) and str(li_book.contents[0]).strip():
                book_string = str(li_book.contents[0])
                book_name = str(li_book.a.contents[0])
            elif li_book.a:
                book_string, book_name = str(li_book.a.contents[0]).split(' ', 1)
            book_link = li_book.a['href']
            book_id = int(BOOK_NUMBER_PATTERN.search(book_string).group(1))
            book_name = book_name.strip()
            db_book = self.find_and_create_book(book_name, book_count, self.language_id, book_id)
            self.process_chapters(db_book, book_id, book_link)
            self.session.commit()
Example #43
 def process_verses(self, db_book, book_number, chapter_number):
     """
     Get the verses for a particular book
     """
     chapter_file_name = os.path.join(self.base_dir, '{:02d}'.format(book_number), '{}.htm'.format(chapter_number))
     with copen(chapter_file_name, encoding='utf-8', errors='ignore') as chapter_file:
         page = chapter_file.read()
     soup = BeautifulSoup(page, 'lxml')
     text_body = soup.find('div', 'textBody')
     if text_body:
         verses_p = text_body.find('p')
     else:
         verses_p = soup.find_all('p')[2]
     verse_number = 0
     verse_text = ''
     for item in verses_p.contents:
         if self.stop_import_flag:
             break
         if isinstance(item, Tag) and 'verse' in item.get('class', []):
             if verse_number > 0:
                 self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
             verse_number = int(item.string.strip())
             verse_text = ''
         elif isinstance(item, NavigableString):
             verse_text += str(item)
         elif isinstance(item, Tag) and item.name in ['span', 'a']:
             verse_text += str(item.string)
         else:
             log.warning('Can\'t store %s', item)
     self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
Example #44
def main(args):
    dict_w = []
    stop_w = []
    if args.dictionary:
        dict_w = load_dictionary(args.dictionary)
    if args.stop_words:
        stop_w = load_dictionary(args.stop_words)
    file1 = copen(gen_name(args.paths[0]), 'w', encoding='utf-8')
    file2 = copen(gen_name(args.paths[1]), 'w', encoding='utf-8')
    for i, pair in enumerate(line_iter(*args.paths)):
        if is_ok(dict_w, stop_w, pair, abs_t=args.abs_diff, rel_t=args.rel_diff):
            file1.write(pair[0])
            file2.write(pair[1])
        if i % 1000 == 0:
            log.info('Lines processed %i', i+1)
    file1.close()
    file2.close()
Example #45
def write_history(text,author):
    """\
    Write the quote to the history file ~/.qhistory in a format that fortune (strfile) can use.
    """
    with copen(expanduser('~/.qhistory'), 'a', 'utf8') as fh:
        fh.write(text + "\n")
        fh.write(' ' * 50 + '- ' + author +"\n")
        fh.write("%\n")
Example #46
def removeLabels(f_gold, f_test, slda=True):
    """ Remove labels from the gold for inference.

    Preprocessing:
    tail -n $(25%) corpus > gold
    sed -i "$(total - 25%),$(total)d"
    """
    print "Removing labels"

    with copen(f_gold, "r", encoding="utf-8") as gold_f:
        with copen(f_test, "w", encoding="utf-8") as test_f:
            for line in gold_f:
                if not slda:
                    test_f.write(sub("\[[0-9]\] ", "", line))
                else:
                    line = sub("\]", "", line)
                    test_f.write(sub("\[[0-9]\|", "", line))
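As an illustration of the two label formats the substitutions above appear to target (the sample lines are invented just for this check):

from re import sub

line_slda = u"[2|you are welcome]\n"   # sLDA-style label
line_lda = u"[2] you are welcome\n"    # plain labelled line
assert sub(r"\[[0-9]\|", "", sub(r"\]", "", line_slda)) == u"you are welcome\n"
assert sub(r"\[[0-9]\] ", "", line_lda) == u"you are welcome\n"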
Example #48
def splitText(f_lda, a, f_train, f_gold):
    """ Splitting the text in a certain percentage. """
    with copen(f_lda, encoding="utf-8") as corpus_f:
        corpus = corpus_f.readlines()

    x = len(corpus) / 100 * a

    train = corpus[0:x]
    gold = corpus[x:]

    with copen(f_train, "w", encoding="utf-8") as train_f:
        for line in train:
            train_f.write(line)

    with copen(f_gold, "w", encoding="utf-8") as gold_f:
        for line in gold:
            gold_f.write(line)
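The line x = len(corpus) / 100 * a relies on Python 2 floor division; on Python 3 it yields a float and the slices fail. A version-independent sketch of the same split (multiplying before the integer division also keeps the remainder):

def splitText(f_lda, a, f_train, f_gold):
    """ Split the corpus so the first a percent goes to train and the rest to gold. """
    with copen(f_lda, encoding="utf-8") as corpus_f:
        corpus = corpus_f.readlines()

    x = len(corpus) * a // 100

    with copen(f_train, "w", encoding="utf-8") as train_f:
        train_f.writelines(corpus[:x])

    with copen(f_gold, "w", encoding="utf-8") as gold_f:
        gold_f.writelines(corpus[x:])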
Example #49
    def __init__(self):
        """Sets up the classifier"""

        #import the main payload with keywords for matching/blocking
        with copen("payload_lica.json", encoding='utf8') as f:
            self.payload = load(f)

        #Build a mapping in memory of keyword to category
        #The payload is kept in the reverse format to make it easier to edit
        self.positive_keywords = {}
        for top_level, sub_level in self.payload['positive_words'].iteritems():
            for category, keywords in sub_level.iteritems():
                for keyword in keywords:
                    self.positive_keywords[keyword] = [top_level, category]

        #create a simple ignored words checker
        self.ignored_words = set(self.payload["ignore_words"])

        #import the domain rules
        with copen("payload_domain_rules.json", encoding='utf8') as f:
            self.rules = load(f)

        #convert the host rules into an easily searchable format
        # e.g. 		"au.movies.yahoo.com": "television",
        # 			should be: "yahoo.com": { 'movies': { 'au': ['arts & entertainment', 'television'] } }

        self.host_rules = defaultdict(dict)
        for host_rule, category in self.rules['host_rules'].iteritems():
            domain = extract(
                host_rule
            )  #ExtractResult(subdomain='au.movies', domain='yahoo', suffix='com')
            tld = domain.domain + "." + domain.suffix  # yahoo.com
            host = domain.subdomain.split('.')  #['au', 'movies']
            tree = make_tree(
                host[::-1], category
            )  #{ 'movies': { 'au': ['arts & entertainment', 'television'] } }
            merge(self.host_rules,
                  {tld: tree})  #merge the host rules with this new data

        #convert the path rules into an easily searchable format
        self.path_rules = defaultdict(dict)
        for path_rule, category in self.rules['path_rules'].iteritems():
            domain = extract(path_rule)
            tld = domain.domain + "." + domain.suffix  #sort of ignoring host+path rules, those can be covered by full DFR later
            path = path_rule.split('/')[1]
            self.path_rules[tld][path] = category
Example #51
def parse_pdtb_file(pdtb_file_name):
	relation_jsons = []
	with copen(pdtb_file_name, mode='r', encoding='latin1') as f:
		lines = deque([x for x in f.readlines()])
	while len(lines) > 0:
		relation_json = relation(lines)
		relation_jsons.append(relation_json)
	return relation_jsons
Example #52
def make_ontology_file():
	wikipedia_page_keywords = {}		#3) "Down_to_Earth_%28Justin_Bieber_song%29": ['one', 'girl']

	with copen("topic_signatures_en.tsv", encoding='utf8') as raw:
		for n, line in enumerate(raw):
			
			line = line[:-1].split('\t') #remove the newline character and separate title from rest
			
			wiki_article_title = line[0] #useful
			
			rest = line[1].split('"')
			page_text_salient_keywords = [x for x in rest[-1].split() if x not in STOPWORDS] #useful
			
			wikipedia_page_keywords[wiki_article_title] = page_text_salient_keywords
			
			if n % 100000 == 0:
				print "Processed {0}% of the pages".format((n/3500000.0)*100)
		print "Total: {0} articles".format(len(wikipedia_page_keywords))
	
	with copen("article_category_matrix.tsv", encoding='utf8') as f:
		#has 144k categories, 97k without numbers
		
		article_phrase_matrix = defaultdict(lambda: defaultdict(int))
		
		for n, line in enumerate(f):
			line = line.split("\t")
			category = line[0]
			if not re.match('.*[0-9].*', category): #as long as the category doesn't have a number in it
				articles = line[1:]
				for article in articles:
					if article in wikipedia_page_keywords:
						for phrase in wikipedia_page_keywords[article]:
							article_phrase_matrix[category][phrase] += 1
			if n % 10000 == 0:
				print "Processed {0}".format(n)
	
	#now export in the form:
	#category \t phrase \t count \t phrase \t count
	with copen('payload.lwca', 'w', encoding='utf8') as f:
		for category, words in article_phrase_matrix.iteritems():
			phrases = []
			for phrase, count in sorted(words.items(), key=lambda x: x[1], reverse=True):
				phrases.append(u"{0}\t{1}".format(phrase, count))
			
			f.write(u"{0}\t{1}\n".format(category, '\t'.join(phrases)))
Example #53
 def output_file(self):
     if not hasattr(self, '_output_file'):
         if self.output_file_path:
             self._output_file = copen(self.output_file_path,
                                       'w+',
                                       encoding=self.output_encoding)
         else:
             self._output_file = stdout
     return self._output_file
Example #54
 def __init__(self, file, *args, **kwargs):
     self.file = file if file[1] == ':' else '%s/%s' % ('/'.join(
         sys.argv[0].split('/')[:-1]), file)
     with copen(file, 'r', encoding='utf-8') as f:
         self.update(
             dict([(x.split(self.kvSep, 1)[0], x.split(self.kvSep, 1)[1])
                   for x in ''.join(f.read()).split(self.separator)
                   if x != '' and x != '\n']))
     self.update(dict(*args, **kwargs))
Example #55
def parseFile(filename):
  result = []
  try:
    with copen(filename, "r", 'utf-8') as f:
      result = json.load(f)
  except:
    print("Failed to parse: " + filename)
    quit(1)
  return result
Example #56
 def test_windows1252(self):
     vtt_string = copen(self.windows_path, encoding='windows-1252').read()
     vtt_file = from_string(vtt_string, encoding='windows-1252', eol='\r\n')
     self.assertEqual(len(vtt_file), 1332)
     self.assertEqual(vtt_file.eol, '\r\n')
     self.assertRaises(UnicodeDecodeError,
                       vttopen,
                       self.utf8_path,
                       encoding='ascii')
Example #57
def evaluateSLDA(f_gold, f_results):
    """ Evaluation Metrics for SLDA. """
    print "Starting evaluation"

    counts = {
        "0": [0., 0., 0., 0.],
        "1": [0., 0., 0., 0.],
        "2": [0., 0., 0., 0.]
    }
    conf = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]

    with copen(f_gold, encoding="utf-8") as gold_f:
        with open(f_results) as results_f:
            for g, r in zip(gold_f, results_f):
                g_topics = findall("\[([0-9])\|", g)
                r_topics = r.split()

                for gold, pred in zip(g_topics, r_topics):
                    for k in counts:
                        if k == pred:
                            if k == gold:
                                # TP
                                conf[int(k)][int(gold)] += 1.
                                counts[k][0] += 1.
                            elif k != gold:
                                # FP
                                conf[int(k)][int(gold)] += 1.
                                counts[k][1] += 1
                        elif k != pred:
                            if k == gold:
                                # FN
                                counts[k][2] += 1.
                            elif k != gold:
                                # TN
                                counts[k][3] += 1.

    for k in counts:
        precision = (counts[k][0] / (counts[k][0] + counts[k][1]))
        recall = (counts[k][0] / (counts[k][0] + counts[k][2]))

        print "Precision of %s:\t\t\t%.2f" % (k, round((precision * 100), 2))
        print "Recall of %s:\t\t\t%.2f" % (k, round((recall * 100), 2))
        print "Specificity of %s:\t\t\t%.2f" % (
            k, round(
                ((counts[k][3] / (counts[k][3] + counts[k][1])) * 100), 2))
        print "Accuracy of %s:\t\t\t%.2f" % (
            k,
            round((
                ((counts[k][0] + counts[k][3]) /
                 (counts[k][0] + counts[k][1] + counts[k][2] + counts[k][3])) *
                100), 2))
        print "F1-Score of %s:\t\t\t%.2f" % (k,
                                             round(
                                                 (2 *
                                                  ((precision * recall) /
                                                   (precision + recall))), 2))
        print
Example #58
def fetch_kle_json(gist_id):
    """Returns the JSON for a keyboard-layout-editor URL.
    """
    cache_file = '/'.join((cache_dir, gist_id))
    headers = {}

    if exists(cache_file):
        # We have a cached copy
        file_stat = stat(cache_file)
        file_age = time() - file_stat.st_mtime

        if file_stat.st_size == 0:
            logging.warning('Removing zero-length cache file %s', cache_file)
            remove(cache_file)
        elif file_age < 30:
            logging.info('Using cache file %s (%s < 30)', cache_file, file_age)
            return copen(cache_file, encoding='UTF-8').read()
        else:
            headers['If-Modified-Since'] = strftime(
                '%a, %d %b %Y %H:%M:%S %Z', localtime(file_stat.st_mtime))
            logging.warning('Adding If-Modified-Since: %s to headers.',
                            headers['If-Modified-Since'])

    keyboard = requests.get(gist_url % gist_id, headers=headers)

    if keyboard.status_code == 304:
        logging.debug("Source for %s hasn't changed, loading from disk.",
                      cache_file)
        return copen(cache_file, encoding='UTF-8').read()

    keyboard = keyboard.json()

    for file in keyboard['files']:
        keyboard_text = keyboard['files'][file]['content']
        break  # First file wins, hope there's only one...

    if not exists(cache_dir):
        makedirs(cache_dir)

    with copen(cache_file, 'w', encoding='UTF-8') as fd:
        fd.write(keyboard_text)  # Write this to a cache file

    return keyboard_text