Example no. 1
def filter_tweets_from_file(inputfilepath,
                            outputfilepath=None,
                            art=False,
                            frequency=False,
                            terms_to_remove=None):
    """
    :param inputfilepath: string, required
        Location path for loading the raw tweets

    :param outputfilepath: string, optional, default: None
        Location path for saving the filtered tweets

    :param art: boolean, optional, default: False
        Remove articles, pronouns and prepositions

    :param frequency: boolean, optional, default: False
        Remove less used words

    :param terms_to_remove: list of string, optional, default: None
        List of terms to remove from each tweet

    :return tweets: list of tweets cleaned and filtered
    """

    tweets = filter_tweets(util.read_from_file(inputfilepath), outputfilepath,
                           art, frequency, terms_to_remove)

    return tweets
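
A minimal usage sketch of the function above, assuming it lives in the project's filter module (as the later examples suggest) and using hypothetical file paths:

# Hypothetical usage; the module name and file paths are assumptions.
from filter import filter_tweets_from_file

tweets = filter_tweets_from_file('raw_tweets.csv',
                                 outputfilepath='clean_tweets.csv',
                                 art=True,
                                 frequency=True,
                                 terms_to_remove=['rt', 'via'])
print(len(tweets), 'tweets after filtering')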
Example no. 2
def main():
    if len(sys.argv) != 2:
        print("Usage: %s '<space-separated search terms>'" % sys.argv[0])
        sys.exit(1)

    output = []
    words = sys.argv[1]

    index = read_from_file("index.dat")
    # for each query word, look up the matching files and their occurrence counts in the index
    for word in words.split(' '):
        output.append(get_details(index, word))
    # rank the results: sum the per-file frequencies and sort (a sketch follows this example)
    result = rank(output)
    display(result)
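
The rank step above is described as summing per-file frequencies and sorting. A hypothetical sketch of such a helper, assuming get_details returns a {filename: frequency} dict for each word (this is not the project's actual implementation):

from collections import Counter

def rank(per_word_details):
    # per_word_details: one {filename: frequency} dict per query word
    totals = Counter()
    for details in per_word_details:
        totals.update(details)
    # highest combined frequency first
    return sorted(totals.items(), key=lambda item: item[1], reverse=True)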
Example no. 3
def read_config(config_path):
    config = {
        "hostnames": [],
        "configs": {
            "CF_Account_ID": None,
            "CF_Zone_ID": None,
            "CF_Api_Token": None,
            "CF_Api_Key": None
        },
        "query_interval_seconds": 30
    }
    try:
        config_bytes = read_from_file(config_path, 'rb')
        config.update(json.loads(config_bytes))
    except Exception:
        # fall back to the defaults above if the config file is missing or malformed
        pass
    return config
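
A short usage sketch; the file name and its contents are illustrative assumptions. Because config.update() only replaces top-level keys, a config file that omits a key keeps the default built above:

# Suppose 'config.json' contains:
#   {"hostnames": ["home.example.com"], "query_interval_seconds": 60}
cfg = read_config('config.json')
print(cfg['hostnames'])                 # ['home.example.com']
print(cfg['query_interval_seconds'])    # 60
print(cfg['configs']['CF_Api_Token'])   # None (default kept)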
Example no. 4
def kmeans(eac, removeTerms, ngram):
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']

    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    if removeTerms:
        tweets = filter.filter_tweets(tweets, terms_to_remove=terms)
    else:
        tweets = filter.filter_tweets(tweets)

    # Reduce tweets list length
    tweets = tweets[0:6300]

    carlitos = 0
    lazaro = 0

    data = []
    for tw in tweets:
        if tw.tw_type == 'Carlitos':
            carlitos += 1
        else:
            lazaro += 1
        data.append(tw.text)

    print(carlitos, lazaro)

    print("Transform Data...")
    # Transform data
    if ngram:
        # alternate_sign=False keeps the hashed features non-negative
        # (replaces the non_negative=True parameter removed from scikit-learn)
        hasher = HashingVectorizer(alternate_sign=False,
                                   ngram_range=(1, 3),
                                   analyzer='word',
                                   norm='l2',
                                   binary=False)
    else:
        hasher = HashingVectorizer(alternate_sign=False, norm='l2', binary=False)
    vectorizer = make_pipeline(hasher)
    X = vectorizer.fit_transform(data)

    count = 0
    precision_list = []

    while count < 100:

        # Start timer
        t0 = time()

        if eac:

            clustering = EAC(30, min_k=2, max_k=10)
            EAC_D = clustering.fit(X).distance_

            # Kmedoids over EAC_D
            kmed = KMedoids(2, init='random', distance_metric="precomputed")
            labels = kmed.fit(EAC_D).labels_

        else:

            km = KMeans(n_clusters=2, init='k-means++', n_init=1, max_iter=100)
            labels = km.fit(X).labels_

        # Assign labels to tweets
        for i in range(len(tweets)):
            tweets[i].label = labels[i]

        # Compute precision and report it (an illustrative sketch of one such
        # measure follows this example)
        precision = util.precision(tweets)
        print("Precision:", precision)

        print("done in %0.3fs" % (time() - t0))

        if isAdable(precision):
            precision_list.append(precision)
            count += 1

    return precision_list
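
util.precision is not shown here. One common way to score a two-cluster labeling against the two known tweet types is cluster purity: for each cluster, count the tweets of its majority type and divide the total by the number of tweets. An illustrative sketch under that assumption (not the project's actual util.precision):

from collections import Counter

def cluster_precision(tweets):
    # Assumes each tweet has .label (cluster id) and .tw_type (one of two types).
    clusters = {}
    for tw in tweets:
        clusters.setdefault(tw.label, []).append(tw.tw_type)
    majority_hits = sum(Counter(types).most_common(1)[0][1]
                        for types in clusters.values())
    return majority_hits / len(tweets)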
Example no. 5
def minhash(eac, shingle, removeTerms):
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']

    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    if removeTerms:
        tweets = filter.filter_tweets(tweets, terms_to_remove=terms)
    else:
        tweets = filter.filter_tweets(tweets)

    # Reduce tweets list length
    tweets = tweets[0:6300]

    carlitos = 0
    lazaro = 0

    data = []
    for tw in tweets:
        if tw.tw_type == 'Carlitos':
            carlitos += 1
        else:
            lazaro += 1
        data.append(tw.text)

    print(carlitos, lazaro)

    # Extract text from tweets
    X = [tw.text for tw in tweets]

    # Start timer
    t0 = time()

    print("Calculating distance matrix...")
    D = metrics.jaccard_minhash_distance_mp(X, shingle_length=shingle)

    count = 0
    precision_list = []

    while count < 100:

        if eac:

            print("EAC clustering...")
            # EAC clustering
            kmedoid = KMedoids(init='random', distance_metric='precomputed')
            clustering = EAC(30, min_k=2, max_k=10, clustering=kmedoid)
            EAC_D = clustering.fit(D).distance_

            # Kmedoids over EAC_D
            kmed = KMedoids(2, init='random', distance_metric="precomputed")
            labels = kmed.fit(EAC_D).labels_

        else:
            kmedoid = KMedoids(2, init='random', distance_metric='precomputed')

            print("Kmedoids clustering...")
            labels = kmedoid.fit(D).labels_

        # Assign labels to tweets
        for i in range(len(tweets)):
            tweets[i].label = labels[i]

        # Compute precision and report it
        precision = util.precision(tweets)
        print("Precision:", precision)

        if isAdable(precision):
            print(count)
            precision_list.append(precision)
            count += 1

        print("done in %0.3fs" % (time() - t0))

    return precision_list
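
metrics.jaccard_minhash_distance_mp is not shown. For reference, the exact pairwise Jaccard distance over character shingles that MinHash approximates can be sketched as below (illustrative only, not the project's multiprocessing implementation; MinHash replaces the full shingle sets with short hashed signatures to make this tractable):

import numpy as np

def jaccard_distance_matrix(texts, shingle_length=3):
    # Build the set of character shingles for each text
    shingle_sets = [
        {t[i:i + shingle_length] for i in range(len(t) - shingle_length + 1)}
        for t in texts
    ]
    n = len(texts)
    D = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            union = shingle_sets[i] | shingle_sets[j]
            jaccard = len(shingle_sets[i] & shingle_sets[j]) / len(union) if union else 1.0
            D[i, j] = D[j, i] = 1.0 - jaccard
    return D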
Example no. 6
    node = nodes[node_name]
    ip_address = '10.10.20.%i/24' % i
    net_intf = SshJob(node=node,
                      command=[
                          Run('mkdir', '/root/captures', '/root/errors'),
                          Run('ifconfig', 'data', ip_address, 'up')
                      ],
                      required=load_images,
                      scheduler=scheduler)
    net_intfs.append(net_intf)

# -------------

# Installing Distrinet in the client/master node

install_script = read_from_file('install_script.sh')
install = SshJob(node=nodes[master],
                 command=RunString(install_script),
                 required=tuple(net_intfs),
                 scheduler=scheduler)

# -------------

# Loading image tarballs

load_tarballs = SshJob(node=faraday,
                       command=Run('scp', '-o StrictHostKeyChecking=no',
                                   '~/VoD/*.tar.gz', 'root@fit01:'),
                       required=install,
                       scheduler=scheduler)
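
The jobs above only declare a dependency graph through required=; nothing executes until the scheduler is run. Assuming scheduler is an asynciojobs/nepi-ng Scheduler, as the SshJob usage suggests, the closing step typically looks like:

# Run the whole dependency graph; orchestrate() returns False if any job failed.
ok = scheduler.orchestrate()
if not ok:
    scheduler.debrief()  # report which jobs failed and why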
Example no. 7
import sys
import csv
from util import read_from_file, get_all_rooms, get_course_attr, get_readable_attrs, print_columns, time_set

times, building, day = sys.argv[1:]
day = day.upper()

search_times = time_set(times)

courses = read_from_file(building)

rooms = get_all_rooms(courses)

whole_day = set(range(0, 1440))

results = []

for room in rooms:
    courses_in_room = [c for c in courses
                       if get_course_attr(c, 'room') == room
                       and day in get_course_attr(c, 'days')]
    busy_times = set()
    for course in courses_in_room:
        busy_times |= time_set(get_course_attr(course, 'rawtime'))
    available_times = whole_day - busy_times
    if search_times <= available_times:
        results.append(room)

print('Found {0} results'.format(len(results)))
results = sorted(results, key=lambda x: x.rjust(99))  # Sort, putting shorter room numbers first (see the example below)
for item in results:
    print(item)
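
The rjust(99) key right-justifies each room number in a fixed-width field before comparing, so shorter (lower) numbers sort first instead of following plain string order; a quick comparison:

rooms = ['12', '105', '9']
print(sorted(rooms))                             # ['105', '12', '9']  (plain string order)
print(sorted(rooms, key=lambda x: x.rjust(99)))  # ['9', '12', '105']  (shorter numbers first)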