Example #1
0
def _array2mat(fl, flo, rStopWords, StemWords):
    """Pivot a (year, word, count) CSV into a word-by-year matrix CSV.

    Rows are read from ``fl`` with ``csv.reader``; each row is used as
    ``(year, word, count)``.  The output file ``flo`` gets a header row
    ``Words, <year>, <year>, ...`` (years sorted ascending) followed by one
    row per distinct word.  A (word, year) cell that never occurred in the
    input is written as ``'0'``.

    Args:
        fl: Path of the input CSV.
        flo: Path of the output CSV (overwritten).
        rStopWords: If truthy, drop rows whose word is in ``STOPWORDS``.
        StemWords: If truthy, replace each word by its Porter stem and sum
            the counts of rows that share the same (year, stem).

    Returns:
        A ``(first_year, last_year)`` tuple taken from the sorted year keys
        (as converted by ``num_if_is_number``).

    Raises:
        IndexError: If no rows survive filtering, because there is then no
            first/last year to return.  The output file (header only) has
            already been written and closed at that point.
    """
    table = defaultdict(dict)
    words = set()

    # The reader (and the stop-word filter on Python 3) is lazy, so every row
    # must be consumed before the input file is closed.
    with open(fl, 'r') as src:
        terms = csv.reader(src)
        if rStopWords:
            terms = (term for term in terms if term[1] not in STOPWORDS)
        if StemWords:
            stemmer = PorterStemmer()
            totals = defaultdict(int)
            for term in terms:
                stem = stemmer.stem(term[1], 0, len(term[1]) - 1)
                totals[(term[0], stem)] += int(term[2])
            terms = [(year, stem, count)
                     for (year, stem), count in totals.items()]

        for (year, word, item) in terms:
            table[num_if_is_number(year)][word] = item
            words.add(word)

    years = sorted(table.keys())
    with open(flo, 'w') as dst:
        fo = csv.writer(dst)
        fo.writerow(['Words'] + [str(year) for year in years])
        for word in words:
            row = [table[year].get(word, '0') for year in years]
            fo.writerow([word] + row)

    return years[0], years[-1]
Example #2
0
def render(vis, request, info):
    """Prepare the word-frequency data file and status fields for a view.

    Reads its options from ``request.args``, builds a word-count SQL query,
    hands it to ``export_sql`` and, when that call reports a fresh load,
    post-processes the exported CSV (optional stop-word removal and Porter
    stemming) into ``<datfile>edited.csv``.

    Args:
        vis: Object whose ``config`` attribute is passed to ``export_sql``.
        request: Request object; options are read from ``request.args``
            (``reload``, ``table``, ``where``, ``field``, ``view``,
            ``MinCharLength``, ``RemoveStopWords``, ``StemWords``, ``start``,
            ``limit``, ``pfield``).
        info: Dict that is filled in place with ``message``,
            ``message_class``, ``datfile`` (only when table and field are
            given) and ``title``.

    Raises:
        ValueError: If one of the numeric options is not an integer.
    """
    info["message"] = []

    args = request.args
    reload = int(args.get("reload", '0'))
    table = args.get("table", '')
    where = args.get("where", '1=1')
    field = args.get("field", '')
    view = args.get("view", '')
    # Numeric options are coerced to int (like reload/RemoveStopWords/
    # StemWords) so that they cannot carry arbitrary SQL into the query.
    minlen = int(args.get("MinCharLength", '3'))
    rStopWords = int(args.get("RemoveStopWords", '0'))
    StemWords = int(args.get("StemWords", '0'))
    start = int(args.get("start", '0'))  # start at 0
    limit = int(args.get("limit", '200'))

    if not table or not field:
        info["message"].append("table or field missing.")
        info["message_class"] = "failure"
    else:
        # SECURITY NOTE(review): field, table and where are request-supplied
        # and interpolated verbatim. They are identifiers / a raw clause, so
        # they cannot be bound as parameters through this string-based
        # export_sql call; they need whitelisting upstream.
        sql = "select word, count(*) as n from (select regexp_split_to_table(regexp_replace(lower(coalesce(%s,'')),'[^a-z0-9@]+',' ','g'),' ') as word, * from %s where %s) as a  where char_length(word) > %d group by 1 order by 2 desc limit %d offset %d" % (
            field, table, where, minlen, limit, start)

        (datfile, reload, result) = export_sql(sql, vis.config, reload, None, view)
        if len(result) > 0:
            # A non-empty result is treated as an error message.
            info["message"].append(result)
            info["message_class"] = "failure"
        else:
            info["message_class"] = "success"
            if reload > 0:
                info["message"].append("Loaded fresh.")
            else:
                info["message"].append("Loading from cache. Use reload=1 to reload.")

        datfileNew = datfile + 'edited.csv'

        # NOTE(review): the edited file is only rewritten on a fresh load, and
        # its name does not depend on RemoveStopWords/StemWords, so a cached
        # request reuses whatever an earlier run produced.
        if reload:
            with open(datfile) as f:
                terms = csv.reader(f)
                if rStopWords:
                    terms = (term for term in terms if term[0] not in STOPWORDS)
                if StemWords:
                    stemmer = PorterStemmer()
                    totals = defaultdict(int)
                    for term in terms:
                        stem = stemmer.stem(term[0], 0, len(term[0]) - 1)
                        totals[stem] += int(term[1])
                    terms = totals.items()

                # Written while the input is still open: the reader and the
                # stop-word filter are consumed lazily.
                with open(datfileNew, 'w') as f2:
                    cs = csv.writer(f2)
                    cs.writerow(["text", "size"])
                    cs.writerows(terms)

        info["datfile"] = datfileNew

    pfield = request.args.get("pfield", [])
    if not isinstance(pfield, (list, tuple)):
        # A single value would otherwise be joined character by character.
        pfield = [pfield]
    # Formatting through Markup escapes the request-supplied values instead
    # of embedding them as raw HTML.
    info["title"] = Markup("FIELDS: <em>%s</em> from <br />TABLE: <em>%s</em>") \
        % (','.join(pfield), table)

    # NOTE(review): messages are inserted unescaped, as before; whether the
    # export_sql result may legitimately contain HTML is not visible here.
    info["message"] = Markup(''.join('<p>%s</p>' % m for m in info["message"] if len(m) > 0))