Code Example #1
def fix_file_encodings(inmainfolder,
                       outmainfolder,
                       in_ext="html",
                       in_encoding="cp1256",
                       out_ext="txt",
                       out_encoding="utf8"):

    subfolders = io_utils.getfoldernames_of_dir(inmainfolder)

    for subf in subfolders:

        p1 = os.path.join(inmainfolder, subf)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=True)

        o1 = io_utils.ensure_dir(os.path.join(outmainfolder, subf))
        print("In ", subf)
        for fname in fnames:

            p2 = os.path.join(p1, fname + "." + in_ext)
            o2 = os.path.join(o1, fname + "." + out_ext)
            fix_file(p2, o2, in_encoding, out_encoding)
            '''
            infile = codecs.open(p2, "r", encoding=in_encoding)
            text = infile.read()
            o2 = os.path.join(o1, fname+"."+out_ext)
            outfile = codecs.open(o2, "w", encoding=out_encoding)
            outfile.write(text)
            infile.close()
            outfile.close()
            '''

        print("Finished..\n")

    print("Done.")
Code Example #2
def count_tweets(folderpath, outfolder):
    """Count all tweets, Turkish tweets and non-reply Turkish tweets per daily
    file, dump the filtered tweet lists to outfolder and return the totals
    (N, Ntr, Nr)."""

    N = 0
    Nr = 0
    Ntr = 0

    days = io_utils.getfoldernames_of_dir(folderpath)

    print(folderpath)
    for day in days:

        p1 = os.path.join(folderpath, day)

        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)

        for fname in fnames:

            p2 = os.path.join(p1, fname)
            '''
            lines = open(p2, "r").readlines()
            nlines = len(lines)
            '''

            tweets = lines2tweets(p2)
            ntweets = len(tweets)

            tr_tweets = count_lang_tweets(tweets, lang="tr")
            ntrtweets = len(tr_tweets)

            plain_tweets = count_nonreply_tweets(tr_tweets)
            nptweets = len(plain_tweets)

            print(" ", day, " / ", fname, "  # lines: ", ntweets,
                  " # tr_tweets: ", ntrtweets, " # non-reply tweeets: ",
                  nptweets)

            N += ntweets
            Nr += nptweets
            Ntr += ntrtweets

            if ntrtweets > 0:
                outpath_tr = os.path.join(outfolder, day + "_" + fname)
                with open(outpath_tr, "w") as f_tr:
                    json.dump(tr_tweets, f_tr)

            if nptweets > 0:
                outpath_nr = os.path.join(outfolder,
                                          day + "_" + fname + "-nonreply")
                with open(outpath_nr, "w") as f_nr:
                    json.dump(plain_tweets, f_nr)

    return N, Ntr, Nr
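
The helpers lines2tweets(), count_lang_tweets() and count_nonreply_tweets() are defined elsewhere in the module; a minimal sketch of what they might look like, assuming one JSON-encoded tweet per line and the standard Twitter lang / in_reply_to_status_id fields, is:

import json

def lines2tweets(path):
    # assumption: one JSON tweet object per non-empty line
    with open(path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]

def count_lang_tweets(tweets, lang="tr"):
    # keep only the tweets whose language tag matches
    return [t for t in tweets if t.get("lang") == lang]

def count_nonreply_tweets(tweets):
    # keep only the tweets that are not replies to another tweet
    return [t for t in tweets if t.get("in_reply_to_status_id") is None]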
Code Example #3
def files_to_csv(mainfolder, outpath, fixfolder, in_encoding="utf-8"):
    """Collect the labelled text files under mainfolder into a shuffled
    DataFrame (text, polarity, domain); files that fail to decode with
    in_encoding are re-read as cp1256 and a UTF-8 copy is written to
    fixfolder."""

    textcol = "text"
    catcol = "polarity"
    other = "domain"

    labels = io_utils.getfoldernames_of_dir(mainfolder)

    rows = []

    for label in labels:
        p1 = os.path.join(mainfolder, label)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)

        print("Reading in ", label)

        for fname in fnames:

            print(" ", fname)
            p2 = os.path.join(p1, fname)

            try:
                f = codecs.open(p2, "r", encoding=in_encoding)
                text = f.read()

            except UnicodeDecodeError:
                f = codecs.open(p2, "r", encoding="cp1256")
                text = f.read()
                # write a re-encoded copy of the problematic file to fixfolder
                fixdir = io_utils.ensure_dir(os.path.join(fixfolder, label))
                f2 = codecs.open(os.path.join(fixdir, fname),
                                 "w",
                                 encoding="utf-8")
                f2.write(text)
                f2.close()

            text = text.strip()

            row = {textcol: text, catcol: label, other: fname}
            rows.append(row)

            f.close()

    df = pd.DataFrame(rows)
    df = df.sample(frac=1).reset_index(drop=True)

    if outpath:
        df.to_csv(outpath, sep="\t", index=False)
    return df
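
A typical call, with hypothetical paths and the folder layout implied above (one subfolder per polarity label, file names reused as the domain column), might look like:

df = files_to_csv(mainfolder="/path/to/sentiment_corpus",
                  outpath="/path/to/sentiment.tsv",
                  fixfolder="/path/to/fixed_files")
print(df.head())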
Code Example #4
def fix_texts_nested(infolder, outfolder):
    """Read every file in the one-level-deep folder hierarchy under infolder,
    run fix_text() on its contents and write the result to the mirrored
    location under outfolder."""

    folders = io_utils.getfoldernames_of_dir(infolder)

    for folder in folders:
        inp1 = os.path.join(infolder, folder)
        outp1 = io_utils.ensure_dir(os.path.join(outfolder, folder))

        files = io_utils.getfilenames_of_dir(inp1, False)
        for file in files:

            inp2 = os.path.join(inp1, file)
            text = read_encoded_file(inp2)
            text = fix_text(text)
            outp2 = os.path.join(outp1, file)
            with open(outp2, "w") as f:
                f.write(text)
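
read_encoded_file() and fix_text() are defined elsewhere; a plausible read_encoded_file(), following the UTF-8-first, cp1256-fallback pattern of Code Example #3, could be:

import codecs

def read_encoded_file(path, encodings=("utf-8", "cp1256")):
    # hypothetical helper: try the encodings in order, return the first successful decode
    for enc in encodings:
        try:
            with codecs.open(path, "r", encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise ValueError("none of the tried encodings could decode " + path)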
Code Example #5
def spam_mails_to_csv(mainfolder, outpath):
    """Parse every mail file under the per-category folders of mainfolder with
    extract_structure(), shuffle the rows and return them as a DataFrame,
    optionally written to outpath as a tab-separated CSV."""

    csv_rows = []
    cats = io_utils.getfoldernames_of_dir(mainfolder)

    for cat in cats:

        p1 = os.path.join(mainfolder, cat)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)

        for fname in fnames:

            p2 = os.path.join(p1, fname)
            with open(p2, "r") as f:
                lines = f.readlines()
            items = extract_structure(lines)
            items["category"] = cat
            csv_rows.append(items)

    random.shuffle(csv_rows)
    df = pd.DataFrame(csv_rows)
    if outpath:
        df.to_csv(outpath, index=False, sep="\t")

    return df
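
extract_structure() is not shown; a minimal sketch, assuming each mail starts with "Field: value" header lines followed by a blank line and the body, might be:

def extract_structure(lines):
    # hypothetical parser: split header lines from the body at the first blank line
    items = {}
    body_start = len(lines)
    for i, line in enumerate(lines):
        if not line.strip():
            body_start = i + 1
            break
        if ":" in line:
            key, _, value = line.partition(":")
            items[key.strip().lower()] = value.strip()
    items["body"] = "".join(lines[body_start:]).strip()
    return items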
Code Example #6
def detect_illstructured(mainfolder, outcsvpath, bodyfolder):
    """Walk the nested mail corpus, pull the header metadata (date, to, cc,
    subject) out of the files whose second line matches the DATE pattern,
    append it to a CSV file and store each message body under bodyfolder."""

    fnames = []

    folders1 = io_utils.getfoldernames_of_dir(mainfolder)

    ngoodfiles = 0

    io_utils.initialize_csv_file(out_header, outcsvpath)

    for folder1 in folders1:
        # assuming the corpus has one more subfolder hierarchy
        p1 = os.path.join(mainfolder, folder1)
        txtfiles = io_utils.getfilenames_of_dir(p1, removeextension=False)

        # collect the file names for the final count
        fnames.extend(txtfiles)

        for txtfile in txtfiles:

            fpath = os.path.join(p1, txtfile)

            # if line 1 carries the date ("Sent:") header, treat the mail as well structured
            with open(fpath) as f:
                lines = f.readlines()

                date = lines[1]
                datep = re.match(r"\s*" + DATE, date)
                if datep:

                    to = lines[2]
                    cc = lines[3]
                    subject = lines[4]

                    date2 = extract_metadata(date, DATE)
                    to2 = extract_metadata(to, TO)
                    cc2 = extract_metadata(cc, CC)
                    subject2 = extract_metadata(subject, SUBJECT)

                    bodylines = [i for i in lines[5:] if not i.isspace()]
                    body = "\n".join(bodylines)
                    body = body.strip()

                    # record the body aside (body is already a str in Python 3,
                    # so no .decode() is needed)
                    io_utils.todisc_txt(body, os.path.join(bodyfolder, txtfile))

                    items = [txtfile, "", to2, cc2, date2, subject2, str(len(body))]
                    io_utils.append_csv_cell_items(items, outcsvpath)

                '''
                datep = re.match(r"\s*" + DATE, txt)
                if datep:
                    if "@" in lines[0]:
                        ngoodfiles += 1
                    else:
                        print("- ", fpath)
                    datestr = txt[datep.end():]
                else:
                    print(fpath)
                '''

    print("nfiles: ", str(len(fnames)))
    print("ngoodfiles: ", str(ngoodfiles))
Code Example #7
    text = mail1[i2 + len(p2):]
    text = text.strip()

    return text, from_


if __name__ == '__main__':
    '''
    p = '/home/dicle/Documents/data/email_datasets/enron/classified/enron_with_categories/2/1825.txt'
    print(get_email(p))
    '''

    emails = []

    folder = "/home/dicle/Documents/data/email_datasets/enron/classified/enron_with_categories"
    subfolders = io_utils.getfoldernames_of_dir(folder)
    id_ = 0
    for subfolder in subfolders:

        p1 = os.path.join(folder, subfolder)
        fnames = io_utils.getfilenames_of_dir(p1, removeextension=False)
        txtfiles = [i for i in fnames if i.endswith(".txt")]
        print(subfolder)
        for txtfile in txtfiles:

            print(" Reading ", txtfile)
            p2 = os.path.join(p1, txtfile)

            text, from_ = get_email(p2)

            emails.append({