Example #1
def removeStopWords(corpus, stopwords):
    '''
    Removes the stopwords from the corpus and returns the corpus without stopwords.
    '''
    stopwords = stopwords.split(',')
    parseList = [word for word in corpus.split() if word.lower() not in stopwords]
    return ' '.join(parseList)
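
# Hypothetical usage sketch: the sample sentence and the comma-separated
# stopword string below are made up for illustration.
print(removeStopWords("The quick brown fox jumps over the lazy dog", "the,over,a"))
# -> "quick brown fox jumps lazy dog"
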
import re  # used below to collapse repeated spaces


def tokenizingAndStopwords(raw_text):
    # Pad punctuation with spaces so each mark becomes its own token.
    text = ''
    for c in raw_text:
        if c in [
                '.', '...', '?', '!', ':', ';', '&', ',', '"', '*', '(', ')',
                '[', ']', '{', '}', '#', '~', '_', '=', '+', '-', '/', '\\'
        ]:
            text += ' ' + c + ' '
        else:
            text += c

    text = re.sub(' +', ' ',
                  text)  # replaces one or more spaces with single space
    text = text.lower().split(' ')

    with open('googleStopwords.txt', 'r') as mf:  # use Google stopwords (one per line)
        stopwords = mf.read()
    stopwords = stopwords.split('\n')
    stopwords = stopwords + [
        '&', '*', '(', ')', '[', ']', '{', '}', '#', '~', '_', '=', '+', '-',
        '\'', '\n'
    ]
    result = []
    for word in text:
        if word not in stopwords:
            result.append(word)

    return result
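
# Hypothetical usage sketch: assumes a newline-separated googleStopwords.txt file
# is present in the working directory; the sample sentence is invented and the
# exact output depends on the contents of that stopword list.
print(tokenizingAndStopwords("Hello, world! This is only a quick test."))
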
Example #3
def getStopWords():
    # Reads a comma-separated stopword file and returns the entries as a list.
    with open('/Users/305015992/pythonProjects/word2vecAnalysis/stopwords.txt', 'r') as stopwordsFile:
        stopwords = stopwordsFile.read()
    # stopwords = stopwords.lower()
    stopwordList = stopwords.split(",")
    print(stopwordList)
    return stopwordList
Example #4
import string

from nltk.tokenize import word_tokenize


def sentiment(stopwords, text):
    # `stopwords` is a whitespace-separated string of stopwords to filter out.
    arr_stopwords = stopwords.split()
    lower_case = text.lower()
    cleaned_text = lower_case.translate(
        str.maketrans('', '', string.punctuation))
    tokenized_words = word_tokenize(cleaned_text, "english")

    final_words = []

    for word in tokenized_words:
        if word not in arr_stopwords:
            final_words.append(word)

    emotion_list = []
    with open('emotions.txt', 'r') as file:
        for line in file:
            clear_line = line.replace('\n',
                                      '').replace(',', '').replace("'",
                                                                   '').strip()
            word, emotion = clear_line.split(':')

            if word in final_words:
                emotion_list.append(emotion)
    sentiment_analysis(cleaned_text)  # helper defined elsewhere in the original project
    return emotion_list
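
# Hypothetical usage sketch: nltk's punkt tokenizer data must be available, and the
# emotions.txt file plus the sentiment_analysis() helper are stand-ins created here
# only so the example runs end to end; both exist separately in the original project.
with open('emotions.txt', 'w') as f:
    f.write("happy: joy\ntrust: trusting\n")

def sentiment_analysis(text):  # placeholder for the helper used by sentiment()
    print("analysing:", text)

print(sentiment("a the is and i", "I am happy and I trust this wonderful day"))
# -> e.g. [' joy', ' trusting']
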
Example #5
def clean_words(job_type_list, stopwords):
    import string  # string.punctuation is used to strip punctuation below
    stopwords = ' '.join(stopwords)
    stopwords = stopwords.translate(str.maketrans('','',string.punctuation)).lower()
    stopwords = stopwords.split(' ')
    stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engeineer','engineering'])
    special_chars = ['--','...','\n','•','®','●','\n']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
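
# Hypothetical usage sketch: the job descriptions and the short stopword list
# below are invented for illustration.
sample_jobs = ["We are hiring a Data Scientist with Python and SQL experience.",
               "The role requires strong machine learning skills."]
print(clean_words(sample_jobs, ["we", "are", "a", "with", "and", "the"]))
# -> e.g. ['hiring', 'python', 'sql', 'requires', 'strong', 'machine', 'learning']
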
def imdb_data_preprocess(inpath, outpath="./", name="imdb_tr.csv", mix=False):
	import os
	import numpy as np
	import pandas as pd

	# remove_stopwords(text, stopwords) is expected to be defined elsewhere in the
	# original script; it strips the given stopwords from a review string.
	with open("stopwords.en.txt", 'r', encoding="ISO-8859-1") as f:
		stopwords = f.read()
	stopwords = stopwords.split("\n")

	indices = []
	text = []
	rating = []

	i = 0

	for filename in os.listdir(inpath + "pos"):
		with open(inpath + "pos/" + filename, 'r', encoding="ISO-8859-1") as f:
			data = f.read()
		data = remove_stopwords(data, stopwords)
		indices.append(i)
		text.append(data)
		rating.append("1")
		i = i + 1

	for filename in os.listdir(inpath + "neg"):
		with open(inpath + "neg/" + filename, 'r', encoding="ISO-8859-1") as f:
			data = f.read()
		data = remove_stopwords(data, stopwords)
		indices.append(i)
		text.append(data)
		rating.append("0")
		i = i + 1

	Dataset = list(zip(indices, text, rating))

	if mix:
		np.random.shuffle(Dataset)

	df = pd.DataFrame(data=Dataset, columns=['row_Number', 'text', 'polarity'])
	df.to_csv(outpath + name, index=False, header=True)
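
# Hypothetical usage sketch: assumes an IMDB-style folder layout ("<inpath>pos/",
# "<inpath>neg/"), a stopwords.en.txt file in the working directory, and a
# remove_stopwords(text, stopwords) helper defined elsewhere in the original project.
imdb_data_preprocess("./aclImdb/train/", outpath="./", name="imdb_tr.csv", mix=True)
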
Example #7
ip_rev_string = re.sub("[^A-Za-z" "]+", " ", ip_rev_string).lower()
ip_rev_string = re.sub("[0-9" "]+", " ", ip_rev_string)

# In[7]:

# words contained in the iPhone 7 reviews
ip_reviews_words = ip_rev_string.split(" ")

# In[8]:

#stop_words = stopwords.words('english')

with open("Downloads\\stop.txt", "r") as sw:
    stopwords = sw.read()

stopwords = stopwords.split("\n")

# In[9]:

#stp_wrds = stopwords+stop_words

temp = ["this", "is", "awsome", "Data", "Science"]
[i for i in temp if i not in "is"]

ip_reviews_words = [w for w in ip_reviews_words if not w in stopwords]

# In[10]:

# Joining all the reviews into a single paragraph
ip_rev_string = " ".join(ip_reviews_words)
Example #8
def ReadStopWordsFile():
    # Reads the "stopwords" file and returns its newline-separated entries as a list.
    with open("stopwords", "r") as stopwordsFile:
        stopwords = stopwordsFile.read()
    stopwords = stopwords.split('\n')
    return stopwords
Example #9

a = turn_dict_into_list(global_ds_jobs_descriptions)
len(a)
####################################
#### Stopwords #######
####################################


import string

import nltk
from nltk.corpus import stopwords

stopwords = nltk.corpus.stopwords.words('english')
stopwords = ' '.join(stopwords)
stopwords = stopwords.translate(str.maketrans('','',string.punctuation)).lower()
stopwords = stopwords.split(' ')
stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engeineer','engineering'])

####################################
#### Top Single words #######
####################################

def clean_words(job_type_list):
    special_chars = ['--','...','\n','•','®','●','\n']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
    
# import modules & set up logging
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

#sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences

# reading the sentences line by line vs. reading the individual files gives different computation times
sentenceIterator = gensim.models.word2vec.LineSentence("all_cases.txt")

# stopwords
with open('stopwords.txt', 'r') as stopwordsFile:
    stopwords = stopwordsFile.read()
stopwordList = stopwords.split(",")

## build the training sentences; PreprocessDoc2Vec is a tokenising/stopword helper defined elsewhere
sentences = []
for sentence in sentenceIterator:
    sentences.append(PreprocessDoc2Vec(" ".join(sentence), stopwordList))
    #sentences.append(PreprocessDoc2Vec(sentence,stoplist))

###if you want bigrams
bigram_transformer = gensim.models.Phrases(sentences)
model = gensim.models.Word2Vec(bigram_transformer[sentences],
                               min_count=10,
                               size=100,
                               sg=1,
                               hs=1)
Example #11
def get_custom_stopwords(stop_words_file):
    # Reads a GBK-encoded stopword file and returns one stopword per line.
    with open(stop_words_file, 'r', encoding='gbk', newline='') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i.replace('\r','') for i in stopwords_list]
    return custom_stopwords_list
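
# Hypothetical usage sketch: 'chinese_stopwords.txt' is a made-up filename for a
# GBK-encoded stopword file with one entry per line.
stopword_list = get_custom_stopwords('chinese_stopwords.txt')
print(stopword_list[:10])
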
Example #12
def scrape_job_text(url):
    # body omitted in the original snippet
    pass

#a = scrape_job_text('https://www.google.com/search?q=data+scientist&ibp=htl;jobs#fpstate=tldetail&htichips=job_family_1:data%20scientist,job_family_1:data%20science,city:O13QqUW2j4Ciw3zdJvuNdg%3D%3D&htidocid=gPQbEIqS6gcFH9aTAAAAAA%3D%3D&htilrad=24.1401&htischips=job_family_1;data%20scientist;data%20science,city;O13QqUW2j4Ciw3zdJvuNdg%3D%3D:Sunnyvale&htivrt=jobs')
#[item for sublist in a for item in sublist]


for url in url_list:
    try:
        master_dict[url] = global_jobs_descriptions[url]
    except:
        master_dict[url] = scrape_job_text(url)
        global_jobs_descriptions[url] = master_dict[url]
np.save(file_name, global_jobs_descriptions)


def turn_dict_into_list(global_jobs_descriptions):
    master_list = list(global_jobs_descriptions.values())
    flat_list = [item for sublist in master_list for item in sublist if 'data' in item and 'the' in item and len(item)>1000]
    print(len(flat_list))
    master_job_text = list(set(flat_list))
    print(len(master_job_text))
    return master_job_text


a = turn_dict_into_list(global_de_jobs_descriptions)
len(a)
####################################
#### Stopwords #######
####################################


import string

import nltk
from nltk.corpus import stopwords

stopwords = nltk.corpus.stopwords.words('english')
stopwords = ' '.join(stopwords)
stopwords = stopwords.translate(str.maketrans('','',string.punctuation)).lower()
stopwords = stopwords.split(' ')
stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engeineer','engineering'])

####################################
#### Top Single words #######
####################################



def clean_words(job_type_list):
    special_chars = ['--','...','\n','•','®','●','\n']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
    

def top_words_counter(resultwords,num_reviews):
    from collections import Counter
    import operator
    counts = Counter(resultwords)
    my_dict = dict(counts)
    sorted_x = sorted(my_dict.items(), key=operator.itemgetter(1),reverse=True)
    try:
        return (sorted_x[0:num_reviews])
    except:
        return("Not enough words")
        


def percentage_word(job_type_list,word):
    # percentage of postings in which `word` appears (case-insensitive substring match)
    num_appear = sum([1 for i in job_type_list if word.lower() in i.lower()])
    total = len(job_type_list)
    return round((num_appear/total)*100,2)

percentage_word(a,'scala')
def compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,word):
    ds = percentage_word(ds_all_jobs_text,word)
    de = percentage_word(de_all_jobs_text,word)
    print(word + ": " + str(ds) + "% DS, " + str(de) + "% DE \n")
    return [word,ds,de]

#####################################################
####Define variables that contain raw job text#######
#####################################################
#ds_all_jobs_text = create_raw_job_text(data_science_url_list)



ds_all_jobs_text = create_raw_job_dict(global_ds_jobs_descriptions,data_science_url_list,'data_science_job_listings.npy')
de_all_jobs_text = create_raw_job_dict(global_de_jobs_descriptions,data_engineer_url_list,'data_engineer_job_listings.npy')


#####################################################
####Clean words #####################################
#####################################################
ds_all_jobs_text_c = clean_words(ds_all_jobs_text)
#print(ds_all_jobs_text)
de_all_jobs_text_c = clean_words(de_all_jobs_text)

#####################################################
####Define DS variables that contain raw job text#######
#####################################################
ds_top_100 = top_words_counter(ds_all_jobs_text_c,100)
print(ds_top_100)

de_top_100 = top_words_counter(de_all_jobs_text_c,100)
print(de_top_100)

#####################################################
####Define DE variables that contain raw job text#######
#####################################################
de_top_100 = top_words_counter(de_all_jobs_text_c,100)

compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'machine learning')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'statistic')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'analysis')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'artificial intelligence')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'predictive modeling')

compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'pipeline')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'big data')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'infrastructure')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'aws')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'ETL')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'kafka')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'scala')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'spark')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'hive')

compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'SQL')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'noSQL')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'hadoop')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'airflow')

compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'aws')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'redshift')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'EC2')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'')



compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'airflow')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'scikit')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'')


compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'tensorflow')
compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'')


compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'cloud')

compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,'o')
print(percentage_word(ds_all_jobs_text,"data"))

print(top_words_counter(ds_all_jobs_text_c,100))

z = top_words_counter(de_all_jobs_text_c,100)
top_100_list = [i[0] for i in z]
print(top_100_list)

top_100_comparison = [compare_percentage_ds_de(ds_all_jobs_text,de_all_jobs_text,word) for word in top_100_list]
print(top_100_comparison)


#####################################################
####Both top lists job text#######
#####################################################
common_words = list(set(word for word, count in ds_top_100) & set(word for word, count in de_top_100))
print(len(common_words))

#####################################################
####Percentage word appears analysis ################
#####################################################
print(percentage_word(ds_all_jobs_text,"data"))
print(percentage_word(de_all_jobs_text,"data"))

print(percentage_word(ds_all_jobs_text,"python"))
print(percentage_word(de_all_jobs_text,"python"))

print(percentage_word(ds_all_jobs_text,"spark"))
print(percentage_word(de_all_jobs_text,"spark"))

from operator import itemgetter

from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder

string.punctuation += "’"

def top_words_bicounter(job_type_list):
    special_chars = ['--','...','\n','•','®','·']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    #resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    #text = ' '.join(resultwords)
    finder = BigramCollocationFinder.from_words(word_tokenize(a))
    l = []
    for k,v in finder.ngram_fd.items():
        #count += 1
        z = (k,v)
        l.append(z)
    l = sorted(l,key=itemgetter(1),reverse=True)
    return(l[0:300])
            
top_words_bicounter(job_text)


def top_words_tricounter(job_type_list):
    special_chars = ['--','...','\n','•','®','·']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    #resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    #text = ' '.join(resultwords)
    finder = TrigramCollocationFinder.from_words(word_tokenize(a))
    l = []
    for k,v in finder.ngram_fd.items():
        #count += 1
        z = (k,v)
        l.append(z)
    l = sorted(l,key=itemgetter(1),reverse=True)
    return(l[0:300])

top_words_tricounter(job_text)

def top_words_quadcounter(job_type_list):
    special_chars = ['--','...','\n','•','®','·']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    #resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    #text = ' '.join(resultwords)
    finder = QuadgramCollocationFinder.from_words(word_tokenize(a))
    l = []
    for k,v in finder.ngram_fd.items():
        #count += 1
        z = (k,v)
        l.append(z)
    l = sorted(l,key=itemgetter(1),reverse=True)
    return(l[0:300])
            
top_words_quadcounter(job_text)



special_chars = ['--','...','\n','•','®']
a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
for char in special_chars:
    a = a.replace(char, ' ') #replace special char with a space
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
text = ' '.join(resultwords)
finder = BigramCollocationFinder.from_words(word_tokenize(text))
for k,v in finder.ngram_fd.items():
    print(k,v)


##deep copy. save a copy.




a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
a = a.replace('\n', ' ') #replace \n with a space
a = a.replace('•', ' ')
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    
flat_list = [item for sublist in all_job_text for item in sublist]
a = ' '.join(flat_list)
sentence = a.split()  # turn into a list of words

from collections import Counter
counts = Counter(sentence)
counts.most_common(10)

counts = Counter(sentence)
dict(counts.most_common(30))




# str.replace("\n", "")

##Google selenium locate element by xpath, two attributes
##read more 


#### LOOK FOR THE "READ MORE" BUTTON





with open("data_science_jobs_raw_text.txt","rb") as fp:
    job_text = pickle.load(fp)